Spaces:

hzxie
/

gaussian-city

Running on Zero

App Files Files Community

hzxie committed on Mar 5, 2025

Commit

83d5461

verified ·

0 Parent(s):

fix: reinitialize the repo.

Browse files

Files changed (45) hide show

.gitattributes +37 -0
.gitignore +183 -0
.gitmodules +3 -0
ARTICLE.md +25 -0
LICENSE +35 -0
README.md +17 -0
app.py +241 -0
assets/CENTERS.pkl +3 -0
assets/NYC-HghtFld.png +3 -0
assets/NYC-SegMap.png +3 -0
gaussiancity/__init__.py +0 -0
gaussiancity/extensions/__init__.py +0 -0
gaussiancity/extensions/diff_gaussian_rasterization/CMakeLists.txt +36 -0
gaussiancity/extensions/diff_gaussian_rasterization/LICENSE.md +83 -0
gaussiancity/extensions/diff_gaussian_rasterization/__init__.py +426 -0
gaussiancity/extensions/diff_gaussian_rasterization/bindings.cpp +19 -0
gaussiancity/extensions/diff_gaussian_rasterization/cuda_rasterizer/auxiliary.h +169 -0
gaussiancity/extensions/diff_gaussian_rasterization/cuda_rasterizer/backward.cu +622 -0
gaussiancity/extensions/diff_gaussian_rasterization/cuda_rasterizer/backward.h +41 -0
gaussiancity/extensions/diff_gaussian_rasterization/cuda_rasterizer/config.h +19 -0
gaussiancity/extensions/diff_gaussian_rasterization/cuda_rasterizer/forward.cu +376 -0
gaussiancity/extensions/diff_gaussian_rasterization/cuda_rasterizer/forward.h +43 -0
gaussiancity/extensions/diff_gaussian_rasterization/cuda_rasterizer/rasterizer.h +52 -0
gaussiancity/extensions/diff_gaussian_rasterization/cuda_rasterizer/rasterizer_impl.cu +339 -0
gaussiancity/extensions/diff_gaussian_rasterization/cuda_rasterizer/rasterizer_impl.h +70 -0
gaussiancity/extensions/diff_gaussian_rasterization/rasterize_points.cu +173 -0
gaussiancity/extensions/diff_gaussian_rasterization/rasterize_points.h +46 -0
gaussiancity/extensions/diff_gaussian_rasterization/setup.py +40 -0
gaussiancity/extensions/diff_gaussian_rasterization/third_party/glm +1 -0
gaussiancity/extensions/diff_gaussian_rasterization/third_party/stbi_image_write.h +1724 -0
gaussiancity/extensions/grid_encoder/__init__.py +193 -0
gaussiancity/extensions/grid_encoder/bindings.cpp +40 -0
gaussiancity/extensions/grid_encoder/grid_encoder_ext.cu +605 -0
gaussiancity/extensions/grid_encoder/setup.py +39 -0
gaussiancity/extensions/voxlib/__init__.py +12 -0
gaussiancity/extensions/voxlib/bindings.cpp +41 -0
gaussiancity/extensions/voxlib/maps_to_volume.cu +142 -0
gaussiancity/extensions/voxlib/points_to_volume.cu +79 -0
gaussiancity/extensions/voxlib/ray_voxel_intersection.cu +332 -0
gaussiancity/extensions/voxlib/setup.py +32 -0
gaussiancity/extensions/voxlib/voxlib_common.h +83 -0
gaussiancity/generator.py +536 -0
gaussiancity/inference.py +582 -0
gaussiancity/pt_v3.py +1344 -0
requirements.txt +14 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1,37 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.whl filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,183 @@

+# ---> Python
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+.pybuilder/
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+idea/
+# VSCode
+.vscode/
+# ---> JupyterNotebooks
+# gitignore template for Jupyter Notebooks
+# website: http://jupyter.org/
+.ipynb_checkpoints
+*/.ipynb_checkpoints/*
+# IPython
+profile_default/
+ipython_config.py
+# User data
+configs/
+data/
+notebooks/
+output/
+flagged/
+*.pth

.gitmodules ADDED Viewed

	@@ -0,0 +1,3 @@

+[submodule "gaussiancity/extensions/diff_gaussian_rasterization/third_party/glm"]
+	path = gaussiancity/extensions/diff_gaussian_rasterization/third_party/glm
+	url = https://github.com/g-truc/glm.git

ARTICLE.md ADDED Viewed

	@@ -0,0 +1,25 @@

+### Citation 📝
+If our work is useful for your research, please consider citing:
+```bibtex
+@inproceedings{xie2025gaussiancity,
+  title     = {Generative Gaussian Splatting for Unbounded 3{D} City Generation},
+  author    = {Xie, Haozhe and
+               Chen, Zhaoxi and
+               Hong, Fangzhou and
+               Liu, Ziwei},
+  booktitle = {CVPR},
+  year      = {2025}
+}
+```
+### License 📋
+This project is licensed under [S-Lab License 1.0](https://huggingface.co/hzxie/city-dreamer/blob/main/LICENSE).
+Redistribution and use for non-commercial purposes should follow this license.
+![Counter](https://api.infinitescript.com/badgen/count?name=hzxie/CityDreamer&ltext=Visitors&color=f97316)
+---

LICENSE ADDED Viewed

	@@ -0,0 +1,35 @@

+S-Lab License 1.0
+Copyright 2025 S-Lab
+Redistribution and use for non-commercial purpose in source and
+binary forms, with or without modification, are permitted provided
+that the following conditions are met:
+1. Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in
+   the documentation and/or other materials provided with the
+   distribution.
+3. Neither the name of the copyright holder nor the names of its
+   contributors may be used to endorse or promote products derived
+   from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+In the event that redistribution and/or use for commercial purpose in
+source or binary forms, with or without modification is required,
+please contact the contributor(s) of the work.

README.md ADDED Viewed

	@@ -0,0 +1,17 @@

+---
+title: GaussianCity
+emoji: 🏙️
+colorFrom: red
+colorTo: blue
+sdk: gradio
+sdk_version: 5.20.0
+app_file: app.py
+pinned: false
+---
+Official demo for **[Generative Gaussian Splatting for Unbounded 3D City Generation](https://github.com/hzxie/GaussianCity) (CVPR 2025).**
+- 🔥 GaussianCity is an unbounded 3D city generator based on 3D Gaussian Splatting.
+- 🤗 Try GaussianCity to generate photorealistic 3D cities.
+- ⚠️  Due to the limited computational resources at Hugging Face, this demo only generates **A SINGLE IMAGE** based on the New York City layout.

app.py ADDED Viewed

	@@ -0,0 +1,241 @@

+# -*- coding: utf-8 -*-
+#
+# @File:   app.py
+# @Author: Haozhe Xie
+# @Date:   2024-03-02 16:30:00
+# @Last Modified by: Haozhe Xie
+# @Last Modified at: 2024-10-13 15:36:50
+# @Email:  root@haozhexie.com
+import gradio as gr
+import logging
+import numpy as np
+import os
+import pickle
+import ssl
+import subprocess
+import sys
+import urllib.request
+from PIL import Image
+# Reinstall PyTorch with CUDA 11.8 (Default version is 12.1)
+# subprocess.call(
+#     [
+#         "pip",
+#         "install",
+#         "torch==2.2.2",
+#         "torchvision==0.17.2",
+#         "--index-url",
+#         "https://download.pytorch.org/whl/cu118",
+#     ]
+# )
+import torch
+# Create a dummy decorator for Non-ZeroGPU environments
+if os.environ.get("SPACES_ZERO_GPU") is not None:
+    import spaces
+else:
+    class spaces:
+        @staticmethod
+        def GPU(func):
+            # This is a dummy wrapper that just calls the function.
+            def wrapper(*args, **kwargs):
+                return func(*args, **kwargs)
+            return wrapper
+# Fix: ssl.SSLCertVerificationError: [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed
+# NOTE(review): this replaces the default HTTPS context for the whole process,
+# so every urllib HTTPS request made afterwards skips certificate verification
+# (including the checkpoint download in get_models). Consider fixing the CA
+# bundle instead of disabling verification.
+ssl._create_default_https_context = ssl._create_unverified_context
+# Import GaussianCity modules
+# The "gaussiancity" directory next to this file is appended to the module
+# search path.
+sys.path.append(os.path.join(os.path.dirname(__file__), "gaussiancity"))
+def _get_output(cmd):
+    try:
+        return subprocess.check_output(cmd).decode("utf-8")
+    except Exception as ex:
+        logging.exception(ex)
+    return None
+def install_cuda_toolkit():
+    # CUDA_TOOLKIT_URL = "https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_520.61.05_linux.run"
+    CUDA_TOOLKIT_URL = "https://developer.download.nvidia.com/compute/cuda/12.2.0/local_installers/cuda_12.2.0_535.54.03_linux.run"
+    CUDA_TOOLKIT_FILE = "/tmp/%s" % os.path.basename(CUDA_TOOLKIT_URL)
+    subprocess.call(["wget", "-q", CUDA_TOOLKIT_URL, "-O", CUDA_TOOLKIT_FILE])
+    subprocess.call(["chmod", "+x", CUDA_TOOLKIT_FILE])
+    subprocess.call([CUDA_TOOLKIT_FILE, "--silent", "--toolkit"])
+    os.environ["CUDA_HOME"] = "/usr/local/cuda"
+    os.environ["PATH"] = "%s/bin:%s" % (os.environ["CUDA_HOME"], os.environ["PATH"])
+    os.environ["LD_LIBRARY_PATH"] = "%s/lib:%s" % (
+        os.environ["CUDA_HOME"],
+        "" if "LD_LIBRARY_PATH" not in os.environ else os.environ["LD_LIBRARY_PATH"],
+    )
+    # Fix: arch_list[-1] += '+PTX'; IndexError: list index out of range
+    os.environ["TORCH_CUDA_ARCH_LIST"] = "8.0;8.6"
+def setup_runtime_env():
+    logging.info("Python Version: %s" % _get_output(["python", "--version"]))
+    logging.info("CUDA Version: %s" % _get_output(["nvcc", "--version"]))
+    logging.info("GCC Version: %s" % _get_output(["gcc", "--version"]))
+    logging.info("CUDA is available: %s" % torch.cuda.is_available())
+    logging.info("CUDA Device Capability: %s" % (torch.cuda.get_device_capability(),))
+    # Install Pre-compiled CUDA extensions (Not working)
+    # Ref: https://huggingface.co/spaces/zero-gpu-explorers/README/discussions/110
+    #
+    # ext_dir = os.path.join(os.path.dirname(__file__), "wheels")
+    # for e in os.listdir(ext_dir):
+    #     logging.info("Installing Extensions from %s" % e)
+    #     subprocess.call(
+    #         ["pip", "install", os.path.join(ext_dir, e)], stderr=subprocess.STDOUT
+    #     )
+    # Compile CUDA extensions
+    ext_dir = os.path.join(os.path.dirname(__file__), "gaussiancity", "extensions")
+    for e in os.listdir(ext_dir):
+        if os.path.isdir(os.path.join(ext_dir, e)):
+            subprocess.call(["pip", "install", "."], cwd=os.path.join(ext_dir, e))
+    logging.info("Installed Python Packages: %s" % _get_output(["pip", "list"]))
+def get_models(file_name):
+    import gaussiancity.generator
+    if not os.path.exists(file_name):
+        urllib.request.urlretrieve(
+            "https://huggingface.co/hzxie/gaussian-city/resolve/main/%s" % file_name,
+            file_name,
+        )
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    ckpt = torch.load(file_name, map_location=torch.device(device), weights_only=False)
+    model = gaussiancity.generator.Generator(
+        ckpt["cfg"].NETWORK.GAUSSIAN,
+        n_classes=ckpt["cfg"].DATASETS.GOOGLE_EARTH.N_CLASSES,
+        proj_size=ckpt["cfg"].DATASETS.GOOGLE_EARTH.PROJ_SIZE,
+    )
+    if torch.cuda.is_available():
+        model = torch.nn.DataParallel(model).cuda().eval()
+    model.load_state_dict(ckpt["gaussian_g"], strict=False)
+    return model
+def get_city_layout():
+    import gaussiancity.inference
+    layout = None
+    if os.path.exists("assets/NYC.pkl"):
+        with open("assets/NYC.pkl", "rb") as fp:
+            layout = pickle.load(fp)
+    else:
+        td_hf = np.array(Image.open("assets/NYC-HghtFld.png")).astype(np.int32)
+        # Fix: nonzero is not supported for tensors with more than INT_MAX elements
+        td_hf[td_hf > 500] = 500
+        bu_hf = np.zeros_like(td_hf)
+        seg_map = np.array(Image.open("assets/NYC-SegMap.png").convert("P")).astype(
+            np.int32
+        )
+        ins_map = gaussiancity.inference.get_instance_seg_map(seg_map.copy())
+        pts_map = gaussiancity.inference.get_point_map(seg_map)
+        layout = {
+            "TD_HF": td_hf,
+            "BU_HF": bu_hf,
+            "SEG": seg_map,
+            "INS": ins_map,
+            "PTS": pts_map,
+        }
+        with open("assets/NYC.pkl", "wb") as fp:
+            pickle.dump(layout, fp)
+    centers = None
+    if os.path.exists("assets/CENTERS.pkl"):
+        with open("assets/CENTERS.pkl", "rb") as fp:
+            centers = pickle.load(fp)
+    else:
+        centers = gaussiancity.inference.get_centers(layout["INS"], layout["TD_HF"])
+        with open("assets/CENTERS.pkl", "wb") as fp:
+            pickle.dump(centers, fp)
+    layout["CTR"] = centers
+    return layout
+@spaces.GPU
+def get_generated_city(radius, altitude, azimuth, map_center):
+    logging.info("CUDA is available: %s" % torch.cuda.is_available())
+    logging.info("PyTorch is built with CUDA: %s" % torch.version.cuda)
+    # The import must be done after CUDA extension compilation
+    import gaussiancity.inference
+    return gaussiancity.inference.generate_city(
+        get_generated_city.fgm.to("cuda"),
+        get_generated_city.bgm.to("cuda"),
+        get_generated_city.city_layout,
+        map_center,
+        map_center,
+        radius,
+        altitude,
+        azimuth,
+    )
+def main(debug):
+    title = "Generative Gaussian Splatting for Unbounded 3D City Generation"
+    with open("README.md", "r") as f:
+        markdown = f.read()
+        desc = markdown[markdown.rfind("---") + 3 :]
+    with open("ARTICLE.md", "r") as f:
+        arti = f.read()
+    app = gr.Interface(
+        get_generated_city,
+        [
+            gr.Slider(256, 768, value=512, step=4, label="Camera Radius (m)"),
+            gr.Slider(256, 768, value=512, step=4, label="Camera Altitude (m)"),
+            gr.Slider(0, 360, value=60, step=5, label="Camera Azimuth (°)"),
+            gr.Slider(1024, 7168, value=3570, step=4, label="Map Center (px)"),
+        ],
+        [gr.Image(type="numpy", label="Generated City")],
+        title=title,
+        description=desc,
+        article=arti,
+        flagging_mode="never",
+    )
+    app.queue(api_open=False)
+    app.launch(debug=debug)
+if __name__ == "__main__":
+    logging.basicConfig(
+        format="[%(levelname)s] %(asctime)s %(message)s", level=logging.INFO
+    )
+    logging.info("Environment Variables: %s" % os.environ)
+    if _get_output(["nvcc", "--version"]) is None:
+        logging.info("Installing CUDA toolkit...")
+        install_cuda_toolkit()
+    else:
+        logging.info("Detected CUDA: %s" % _get_output(["nvcc", "--version"]))
+    logging.info("Compiling CUDA extensions...")
+    # setup_runtime_env()
+    logging.info("Downloading pretrained models...")
+    fgm = get_models("GaussianCity-Fgnd.pth")
+    bgm = get_models("GaussianCity-Bgnd.pth")
+    get_generated_city.fgm = fgm
+    get_generated_city.bgm = bgm
+    logging.info("Loading New York city layout to RAM...")
+    city_layout = get_city_layout()
+    get_generated_city.city_layout = city_layout
+    logging.info("Starting the main application...")
+    main(os.getenv("DEBUG") == "1")

assets/CENTERS.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cad871bfb3997485a6d464c1464c2a551b601a4444913b9ec808530d093eefd8
+size 728474

assets/NYC-HghtFld.png ADDED Viewed

Git LFS Details

SHA256: 51bcb2d4b097e1307e254427dbf8ec05772ff8e833a5d53993c7380188214ba9
Pointer size: 132 Bytes
Size of remote file: 5.29 MB

assets/NYC-SegMap.png ADDED Viewed

Git LFS Details

SHA256: 0e6f34f802829f97462885ab3a07b7720d7eb18bf15f810e683c89c8d53c3b6d
Pointer size: 132 Bytes
Size of remote file: 3 MB

gaussiancity/__init__.py ADDED Viewed

File without changes

gaussiancity/extensions/__init__.py ADDED Viewed

File without changes

gaussiancity/extensions/diff_gaussian_rasterization/CMakeLists.txt ADDED Viewed

	@@ -0,0 +1,36 @@

+#
+# Copyright (C) 2023, Inria
+# GRAPHDECO research group, https://team.inria.fr/graphdeco
+# All rights reserved.
+#
+# This software is free for non-commercial, research and evaluation use
+# under the terms of the LICENSE.md file.
+#
+# For inquiries contact  george.drettakis@inria.fr
+#
+cmake_minimum_required(VERSION 3.20)
+project(DiffRast LANGUAGES CUDA CXX)
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_EXTENSIONS OFF)
+set(CMAKE_CUDA_STANDARD 17)
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
+add_library(CudaRasterizer
+	cuda_rasterizer/backward.h
+	cuda_rasterizer/backward.cu
+	cuda_rasterizer/forward.h
+	cuda_rasterizer/forward.cu
+	cuda_rasterizer/auxiliary.h
+	cuda_rasterizer/rasterizer_impl.cu
+	cuda_rasterizer/rasterizer_impl.h
+	cuda_rasterizer/rasterizer.h
+)
+set_target_properties(CudaRasterizer PROPERTIES CUDA_ARCHITECTURES "70;75;86")
+target_include_directories(CudaRasterizer PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/cuda_rasterizer)
+target_include_directories(CudaRasterizer PRIVATE third_party/glm ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})

gaussiancity/extensions/diff_gaussian_rasterization/LICENSE.md ADDED Viewed

	@@ -0,0 +1,83 @@

+Gaussian-Splatting License
+===========================
+**Inria** and **the Max Planck Institut for Informatik (MPII)** hold all the ownership rights on the *Software* named **gaussian-splatting**.
+The *Software* is in the process of being registered with the Agence pour la Protection des
+Programmes (APP).
+The *Software* is still being developed by the *Licensor*.
+*Licensor*'s goal is to allow the research community to use, test and evaluate
+the *Software*.
+## 1.  Definitions
+*Licensee* means any person or entity that uses the *Software* and distributes
+its *Work*.
+*Licensor* means the owners of the *Software*, i.e Inria and MPII
+*Software* means the original work of authorship made available under this
+License ie gaussian-splatting.
+*Work* means the *Software* and any additions to or derivative works of the
+*Software* that are made available under this License.
+## 2.  Purpose
+This license is intended to define the rights granted to the *Licensee* by
+Licensors under the *Software*.
+## 3.  Rights granted
+For the above reasons Licensors have decided to distribute the *Software*.
+Licensors grant non-exclusive rights to use the *Software* for research purposes
+to research users (both academic and industrial), free of charge, without right
+to sublicense.. The *Software* may be used "non-commercially", i.e., for research
+and/or evaluation purposes only.
+Subject to the terms and conditions of this License, you are granted a
+non-exclusive, royalty-free, license to reproduce, prepare derivative works of,
+publicly display, publicly perform and distribute its *Work* and any resulting
+derivative works in any form.
+## 4.  Limitations
+**4.1 Redistribution.** You may reproduce or distribute the *Work* only if (a) you do
+so under this License, (b) you include a complete copy of this License with
+your distribution, and (c) you retain without modification any copyright,
+patent, trademark, or attribution notices that are present in the *Work*.
+**4.2 Derivative Works.** You may specify that additional or different terms apply
+to the use, reproduction, and distribution of your derivative works of the *Work*
+("Your Terms") only if (a) Your Terms provide that the use limitation in
+Section 2 applies to your derivative works, and (b) you identify the specific
+derivative works that are subject to Your Terms. Notwithstanding Your Terms,
+this License (including the redistribution requirements in Section 3.1) will
+continue to apply to the *Work* itself.
+**4.3** Any other use without of prior consent of Licensors is prohibited. Research
+users explicitly acknowledge having received from Licensors all information
+allowing to appreciate the adequacy between of the *Software* and their needs and
+to undertake all necessary precautions for its execution and use.
+**4.4** The *Software* is provided both as a compiled library file and as source
+code. In case of using the *Software* for a publication or other results obtained
+through the use of the *Software*, users are strongly encouraged to cite the
+corresponding publications as explained in the documentation of the *Software*.
+## 5.  Disclaimer
+THE USER CANNOT USE, EXPLOIT OR DISTRIBUTE THE *SOFTWARE* FOR COMMERCIAL PURPOSES
+WITHOUT PRIOR AND EXPLICIT CONSENT OF LICENSORS. YOU MUST CONTACT INRIA FOR ANY
+UNAUTHORIZED USE: stip-sophia.transfert@inria.fr . ANY SUCH ACTION WILL
+CONSTITUTE A FORGERY. THIS *SOFTWARE* IS PROVIDED "AS IS" WITHOUT ANY WARRANTIES
+OF ANY NATURE AND ANY EXPRESS OR IMPLIED WARRANTIES, WITH REGARDS TO COMMERCIAL
+USE, PROFESSIONNAL USE, LEGAL OR NOT, OR OTHER, OR COMMERCIALISATION OR
+ADAPTATION. UNLESS EXPLICITLY PROVIDED BY LAW, IN NO EVENT, SHALL INRIA OR THE
+AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
+GOODS OR SERVICES, LOSS OF USE, DATA, OR PROFITS OR BUSINESS INTERRUPTION)
+HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING FROM, OUT OF OR
+IN CONNECTION WITH THE *SOFTWARE* OR THE USE OR OTHER DEALINGS IN THE *SOFTWARE*.

gaussiancity/extensions/diff_gaussian_rasterization/__init__.py ADDED Viewed

	@@ -0,0 +1,426 @@

+# -*- coding: utf-8 -*-
+#
+# @File:   __init__.py
+# @Author: Inria <george.drettakis@inria.fr>
+# @Date:   2024-01-31 19:07:01
+# @Last Modified by: Haozhe Xie
+# @Last Modified at: 2024-05-01 14:14:49
+# @Email:  root@haozhexie.com
+import math
+import numpy as np
+import scipy.spatial.transform
+import torch
+import typing
+import diff_gaussian_rasterization_ext as dgr_ext
+class RasterizeGaussiansFunction(torch.autograd.Function):
+    @staticmethod
+    def _cpu_deep_copy_tuple(input_tuple):
+        copied_tensors = [
+            item.cpu().clone() if isinstance(item, torch.Tensor) else item
+            for item in input_tuple
+        ]
+        return tuple(copied_tensors)
+    @staticmethod
+    def forward(
+        ctx,
+        means3D,
+        means2D,
+        sh,
+        colors_precomp,
+        opacities,
+        scales,
+        rotations,
+        cov3Ds_precomp,
+        raster_settings,
+    ):
+        # Restructure arguments the way that the C++ lib expects them
+        args = (
+            raster_settings.bg,
+            means3D,
+            colors_precomp,
+            opacities,
+            scales,
+            rotations,
+            raster_settings.scale_modifier,
+            cov3Ds_precomp,
+            raster_settings.view_matrix,
+            raster_settings.proj_matrix,
+            raster_settings.tanfovx,
+            raster_settings.tanfovy,
+            raster_settings.img_h,
+            raster_settings.img_w,
+            sh,
+            raster_settings.sh_degree,
+            raster_settings.campos,
+            raster_settings.prefiltered,
+            raster_settings.debug,
+        )
+        # Invoke C++/CUDA rasterizer
+        if raster_settings.debug:
+            cpu_args = RasterizeGaussiansFunction._cpu_deep_copy_tuple(
+                input_tuple=args
+            )  # Copy them before they can be corrupted
+            try:
+                (
+                    num_rendered,
+                    color,
+                    radii,
+                    geom_buffer,
+                    binning_buffer,
+                    img_buffer,
+                ) = dgr_ext.rasterize_gaussians(*args)
+            except Exception as ex:
+                torch.save(cpu_args, "snapshot_fw.dump")
+                print(
+                    "\nAn error occured in forward. Please forward snapshot_fw.dump for debugging."
+                )
+                raise ex
+        else:
+            (
+                num_rendered,
+                color,
+                radii,
+                geom_buffer,
+                binning_buffer,
+                img_buffer,
+            ) = dgr_ext.rasterize_gaussians(*args)
+        # Keep relevant tensors for backward
+        ctx.raster_settings = raster_settings
+        ctx.num_rendered = num_rendered
+        ctx.save_for_backward(
+            colors_precomp,
+            means3D,
+            scales,
+            rotations,
+            cov3Ds_precomp,
+            radii,
+            sh,
+            geom_buffer,
+            binning_buffer,
+            img_buffer,
+        )
+        return color, radii
+    @staticmethod
+    def backward(ctx, grad_out_color, _):
+        # Restore necessary values from context
+        num_rendered = ctx.num_rendered
+        raster_settings = ctx.raster_settings
+        (
+            colors_precomp,
+            means3D,
+            scales,
+            rotations,
+            cov3Ds_precomp,
+            radii,
+            sh,
+            geom_buffer,
+            binning_buffer,
+            img_buffer,
+        ) = ctx.saved_tensors
+        # Restructure args as C++ method expects them
+        args = (
+            raster_settings.bg,
+            means3D,
+            radii,
+            colors_precomp,
+            scales,
+            rotations,
+            raster_settings.scale_modifier,
+            cov3Ds_precomp,
+            raster_settings.view_matrix,
+            raster_settings.proj_matrix,
+            raster_settings.tanfovx,
+            raster_settings.tanfovy,
+            grad_out_color,
+            sh,
+            raster_settings.sh_degree,
+            raster_settings.campos,
+            geom_buffer,
+            num_rendered,
+            binning_buffer,
+            img_buffer,
+            raster_settings.debug,
+        )
+        # Compute gradients for relevant tensors by invoking backward method
+        if raster_settings.debug:
+            cpu_args = RasterizeGaussiansFunction._cpu_deep_copy_tuple(
+                input_tuple=args
+            )  # Copy them before they can be corrupted
+            try:
+                (
+                    grad_means2D,
+                    grad_colors_precomp,
+                    grad_opacities,
+                    grad_means3D,
+                    grad_cov3Ds_precomp,
+                    grad_sh,
+                    grad_scales,
+                    grad_rotations,
+                ) = dgr_ext.rasterize_gaussians_backward(*args)
+            except Exception as ex:
+                torch.save(cpu_args, "snapshot_bw.dump")
+                print(
+                    "\nAn error occured in backward. Writing snapshot_bw.dump for debugging.\n"
+                )
+                raise ex
+        else:
+            (
+                grad_means2D,
+                grad_colors_precomp,
+                grad_opacities,
+                grad_means3D,
+                grad_cov3Ds_precomp,
+                grad_sh,
+                grad_scales,
+                grad_rotations,
+            ) = dgr_ext.rasterize_gaussians_backward(*args)
+        grads = (
+            grad_means3D,
+            grad_means2D,
+            grad_sh,
+            grad_colors_precomp,
+            grad_opacities,
+            grad_scales,
+            grad_rotations,
+            grad_cov3Ds_precomp,
+            None,
+        )
+        return grads
+class GaussianRasterizationSettings(typing.NamedTuple):
+    """Immutable bundle of rasterizer settings.
+    RasterizeGaussiansFunction reads these fields and forwards them to the
+    ``dgr_ext`` extension calls.
+    """
+    # Output image size (presumably in pixels; confirm against the extension).
+    img_h: int
+    img_w: int
+    # Presumably the tangents of the half field-of-view angles; TODO confirm.
+    tanfovx: float
+    tanfovy: float
+    # Passed as the first argument of both extension calls (presumably the
+    # background color).
+    bg: torch.Tensor
+    scale_modifier: float
+    view_matrix: torch.Tensor
+    proj_matrix: torch.Tensor
+    sh_degree: int
+    campos: torch.Tensor
+    prefiltered: bool
+    # When true, RasterizeGaussiansFunction snapshots the call arguments to a
+    # dump file if an extension call raises.
+    debug: bool
+class GaussianRasterizer(torch.nn.Module):
+    def __init__(self, raster_settings):
+        super(GaussianRasterizer, self).__init__()
+        self.raster_settings = raster_settings
+    def forward(
+        self,
+        means3D,
+        means2D,
+        opacities,
+        shs=None,
+        colors_precomp=None,
+        scales=None,
+        rotations=None,
+        cov3D_precomp=None,
+    ):
+        raster_settings = self.raster_settings
+        if (shs is None and colors_precomp is None) or (
+            shs is not None and colors_precomp is not None
+        ):
+            raise Exception(
+                "Please provide excatly one of either SHs or precomputed colors!"
+            )
+        if ((scales is None or rotations is None) and cov3D_precomp is None) or (
+            (scales is not None or rotations is not None) and cov3D_precomp is not None
+        ):
+            raise Exception(
+                "Please provide exactly one of either scale/rotation pair or precomputed 3D covariance!"
+            )
+        if shs is None:
+            shs = torch.Tensor([])
+        if colors_precomp is None:
+            colors_precomp = torch.Tensor([])
+        if scales is None:
+            scales = torch.Tensor([])
+        if rotations is None:
+            rotations = torch.Tensor([])
+        if cov3D_precomp is None:
+            cov3D_precomp = torch.Tensor([])
+        # Invoke C++/CUDA rasterization routine
+        return RasterizeGaussiansFunction.apply(
+            means3D,
+            means2D,
+            shs,
+            colors_precomp,
+            opacities,
+            scales,
+            rotations,
+            cov3D_precomp,
+            raster_settings,
+        )
+class GaussianRasterizerWrapper(torch.nn.Module):
+    # Carving flowers on a mountain of dung code.
+    #
+    # This class is a wrapper for the GaussianRasterizer class.
+    # It is used to port for the GaussianCity project.
+    def __init__(
+        self,
+        K,
+        sensor_size,
+        flip_lr=True,
+        flip_ud=False,
+        z_near=0.01,
+        z_far=50000.0,
+        device=torch.device("cuda"),
+    ):
+        super(GaussianRasterizerWrapper, self).__init__()
+        self.flip_lr = flip_lr
+        self.flip_ud = flip_ud
+        self.z_near = z_near
+        self.z_far = z_far
+        self.device = device
+        # Shared camera parameters
+        self.K = K
+        self.sensor_size = sensor_size
+        self.fov_x, self.fov_y = self._intrinsic_to_fov()
+        self.P = self._get_projection_matrix()
+    def get_gaussian_rasterizer(self, cam_position, cam_quaternion):
+        # cam_position in (tx, ty, tz)
+        # cam_quaternion in (qx, qy, qz, qw)
+        return GaussianRasterizer(
+            raster_settings=self._get_gaussian_rasterization_settings(
+                cam_position, cam_quaternion
+            )
+        )
+    def forward(
+        self, points, cam_position=None, cam_quaternion=None, gaussian_rasterizer=None
+    ):
+        # points: [N, M], M -> 0:3 xyz, 3:4 opacity, 4:7 scale, 7:11 rotation, 11:14 rgbs
+        _, M = points.shape
+        assert M == 14, "The input tensor should have 14 channels."
+        if gaussian_rasterizer is None:
+            gaussian_rasterizer = self.get_gaussian_rasterizer(
+                cam_position, cam_quaternion
+            )
+        return self._get_gaussian_rasterization(points, gaussian_rasterizer)
+    def _intrinsic_to_fov(self):
+        # graphdeco-inria/gaussian-splatting/utils/graphics_utils.py#L76
+        fx, fy = self.K[0, 0], self.K[1, 1]
+        fov_x = 2 * np.arctan2(self.sensor_size[0], (2 * fx))
+        fov_y = 2 * np.arctan2(self.sensor_size[1], (2 * fy))
+        return fov_x, fov_y
+    def _get_projection_matrix(self):
+        fx = self.K[0, 0]
+        fy = self.K[1, 1]
+        cx = self.K[0, 2]
+        cy = self.K[1, 2]
+        P = np.zeros((4, 4), dtype=np.float32)
+        P[0, 0] = 2.0 * fx / self.sensor_size[0]
+        P[1, 1] = 2.0 * fy / self.sensor_size[1]
+        P[0, 2] = (2.0 * cx / self.sensor_size[0]) - 1.0
+        P[1, 2] = (2.0 * cy / self.sensor_size[1]) - 1.0
+        P[2, 2] = -(self.z_far + self.z_near) / (self.z_far - self.z_near)
+        P[3, 2] = -1.0
+        P[2, 3] = -2.0 * self.z_far * self.z_near / (self.z_far - self.z_near)
+        return torch.from_numpy(P).to(self.device)
+    def _get_w2c_matrix(self, cam_position, cam_quaternion):
+        if type(cam_position) is torch.Tensor:
+            cam_position = cam_position.cpu().numpy()
+        if type(cam_quaternion) is torch.Tensor:
+            cam_quaternion = cam_quaternion.cpu().numpy()
+        R = scipy.spatial.transform.Rotation.from_quat(cam_quaternion).as_matrix()
+        # look_at = cam_position + R[:3, 0]
+        R = R[:, [1, 2, 0]]  # [F|R|U] -> [R|U|F]
+        # graphdeco-inria/gaussian-splatting/blob/main/scene/cameras.py#L31
+        # The w2c matrix
+        Rt = np.zeros((4, 4), dtype=np.float32)
+        Rt[:3, :3] = R.transpose()
+        Rt[:3, [3]] = -R.transpose() @ cam_position[:, None]
+        Rt[3, 3] = 1.0
+        # The c2w matrix
+        # Rt[:3, :3] = R
+        # Rt[:3, 3] = cam_position
+        # Rt[3, 3] = 1.0
+        return torch.from_numpy(Rt).to(self.device)
+    def _world_to_pixel(self, world_coords, w2c):
+        # NOTE: The function is used to debug whether the w2c matrix is correct.
+        # Convert world coordinates to camera coordinates using the inverse of w2c
+        camera_coords = np.dot(w2c[:3, :3], world_coords) + w2c[:3, 3]
+        # camera_coords = np.dot(np.linalg.inv(c2w[:3, :3]), (world_coords- w2c[:3, 3]))
+        # Apply the camera intrinsic matrix K to obtain normalized image coordinates
+        homogeneous_coords = np.dot(self.K, camera_coords)
+        # Normalize homogeneous coordinates
+        normalized_coords = homogeneous_coords / homogeneous_coords[2]
+        # Convert normalized coordinates to pixel coordinates
+        return normalized_coords[:2].astype(int)
+    def _get_gaussian_rasterization_settings(self, cam_position, cam_quaternion):
+        BG_COLOR = torch.tensor(
+            [0.0, 0.0, 0.0], dtype=torch.float32, device=self.device
+        )
+        w2c = self._get_w2c_matrix(cam_position, cam_quaternion).transpose(0, 1)
+        prj_mtx = self.P.transpose(0, 1)
+        return GaussianRasterizationSettings(
+            img_h=self.sensor_size[1],
+            img_w=self.sensor_size[0],
+            tanfovx=math.tan(self.fov_x * 0.5),
+            tanfovy=math.tan(self.fov_y * 0.5),
+            bg=BG_COLOR,
+            scale_modifier=1.0,
+            view_matrix=w2c,
+            proj_matrix=w2c @ prj_mtx,
+            sh_degree=0,
+            campos=w2c.inverse()[3, :3],
+            prefiltered=False,
+            debug=False,
+        )
+    def _get_gaussian_rasterization(self, points, rasterizer):
+        xyz = points[:, 0:3]
+        opacity = points[:, 3:4]
+        scales = points[:, 4:7]
+        quaternion = points[:, 7:11]
+        rgbs = points[:, 11:]
+        rendered_image, _ = rasterizer(
+            means3D=xyz,
+            means2D=torch.zeros_like(xyz, dtype=torch.float32, device=self.device),
+            shs=None,
+            colors_precomp=rgbs,
+            opacities=opacity,
+            scales=scales,
+            rotations=quaternion,
+            cov3D_precomp=None,
+        )
+        if self.flip_lr:
+            rendered_image = torch.flip(rendered_image, dims=[2])
+        if self.flip_ud:
+            rendered_image = torch.flip(rendered_image, dims=[1])
+        return rendered_image

gaussiancity/extensions/diff_gaussian_rasterization/bindings.cpp ADDED Viewed

	@@ -0,0 +1,19 @@

+/*
+ * Copyright (C) 2023, Inria
+ * GRAPHDECO research group, https://team.inria.fr/graphdeco
+ * All rights reserved.
+ *
+ * This software is free for non-commercial, research and evaluation use
+ * under the terms of the LICENSE.md file.
+ *
+ * For inquiries contact  george.drettakis@inria.fr
+ */
+#include "rasterize_points.h"
+#include <torch/extension.h>
+// Python bindings for the extension module: registers the forward pass, the
+// backward pass and the visibility helper under the names used from Python
+// (e.g. rasterize_gaussians_backward).
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+  m.def("rasterize_gaussians", &RasterizeGaussiansCUDA);
+  m.def("rasterize_gaussians_backward", &RasterizeGaussiansBackwardCUDA);
+  m.def("mark_visible", &markVisible);
+}

gaussiancity/extensions/diff_gaussian_rasterization/cuda_rasterizer/auxiliary.h ADDED Viewed

	@@ -0,0 +1,169 @@

+/*
+ * Copyright (C) 2023, Inria
+ * GRAPHDECO research group, https://team.inria.fr/graphdeco
+ * All rights reserved.
+ *
+ * This software is free for non-commercial, research and evaluation use
+ * under the terms of the LICENSE.md file.
+ *
+ * For inquiries contact  george.drettakis@inria.fr
+ */
+#ifndef CUDA_RASTERIZER_AUXILIARY_H_INCLUDED
+#define CUDA_RASTERIZER_AUXILIARY_H_INCLUDED
+#include "config.h"
+#include "stdio.h"
+// Number of threads in one BLOCK_X x BLOCK_Y block (BLOCK_X / BLOCK_Y are
+// presumably defined in config.h -- confirm).
+#define BLOCK_SIZE (BLOCK_X * BLOCK_Y)
+// Number of 32-thread groups per block.
+#define NUM_WARPS (BLOCK_SIZE / 32)
+// Spherical harmonics coefficients
+// SH_C0: degree 0 (1 value); SH_C1: degree 1 (shared by 3 basis functions);
+// SH_C2: degree 2 (5 values); SH_C3: degree 3 (7 values).
+__device__ const float SH_C0 = 0.28209479177387814f;
+__device__ const float SH_C1 = 0.4886025119029199f;
+__device__ const float SH_C2[] = {1.0925484305920792f, -1.0925484305920792f,
+                                  0.31539156525252005f, -1.0925484305920792f,
+                                  0.5462742152960396f};
+__device__ const float SH_C3[] = {-0.5900435899266435f, 2.890611442640554f,
+                                  -0.4570457994644658f, 0.3731763325901154f,
+                                  -0.4570457994644658f, 1.445305721320277f,
+                                  -0.5900435899266435f};
+// Maps a normalized coordinate v to pixel units along an axis of S pixels:
+// ((v + 1) * S - 1) / 2, so v = -1 gives -0.5 and v = +1 gives S - 0.5.
+// The literals are doubles, so the arithmetic is carried out in double
+// precision before the result is converted back to float.
+__forceinline__ __device__ float ndc2Pix(float v, int S) {
+  return ((v + 1.0) * S - 1.0) * 0.5;
+}
+// Computes the range of blocks (tiles) touched by a square of half-extent
+// max_radius centered at p. Coordinates are divided by BLOCK_X / BLOCK_Y to
+// obtain tile indices, and both corners are clamped to [0, grid.x] and
+// [0, grid.y]. rect_max is rounded up (BLOCK - 1 is added before dividing).
+// Results are written to rect_min and rect_max.
+__forceinline__ __device__ void getRect(const float2 p, int max_radius,
+                                        uint2 &rect_min, uint2 &rect_max,
+                                        dim3 grid) {
+  rect_min = {min(grid.x, max((int)0, (int)((p.x - max_radius) / BLOCK_X))),
+              min(grid.y, max((int)0, (int)((p.y - max_radius) / BLOCK_Y)))};
+  rect_max = {
+      min(grid.x,
+          max((int)0, (int)((p.x + max_radius + BLOCK_X - 1) / BLOCK_X))),
+      min(grid.y,
+          max((int)0, (int)((p.y + max_radius + BLOCK_Y - 1) / BLOCK_Y)))};
+}
+// Applies a 4x4 matrix, given as 16 floats where element (row r, column c)
+// is read from matrix[4 * c + r], to the point p and returns the first
+// three rows of the result. The translation terms (indices 12..14) are
+// included.
+__forceinline__ __device__ float3 transformPoint4x3(const float3 &p,
+                                                    const float *matrix) {
+  float3 transformed = {
+      matrix[0] * p.x + matrix[4] * p.y + matrix[8] * p.z + matrix[12],
+      matrix[1] * p.x + matrix[5] * p.y + matrix[9] * p.z + matrix[13],
+      matrix[2] * p.x + matrix[6] * p.y + matrix[10] * p.z + matrix[14],
+  };
+  return transformed;
+}
+// Same indexing as transformPoint4x3, but all four rows are evaluated and
+// the homogeneous result (x, y, z, w) is returned without dividing by w.
+__forceinline__ __device__ float4 transformPoint4x4(const float3 &p,
+                                                    const float *matrix) {
+  float4 transformed = {
+      matrix[0] * p.x + matrix[4] * p.y + matrix[8] * p.z + matrix[12],
+      matrix[1] * p.x + matrix[5] * p.y + matrix[9] * p.z + matrix[13],
+      matrix[2] * p.x + matrix[6] * p.y + matrix[10] * p.z + matrix[14],
+      matrix[3] * p.x + matrix[7] * p.y + matrix[11] * p.z + matrix[15]};
+  return transformed;
+}
+// Applies only the upper-left 3x3 part of the matrix to p; the translation
+// terms (indices 12..14) are not added, so this is suited to directions
+// rather than positions.
+__forceinline__ __device__ float3 transformVec4x3(const float3 &p,
+                                                  const float *matrix) {
+  float3 transformed = {
+      matrix[0] * p.x + matrix[4] * p.y + matrix[8] * p.z,
+      matrix[1] * p.x + matrix[5] * p.y + matrix[9] * p.z,
+      matrix[2] * p.x + matrix[6] * p.y + matrix[10] * p.z,
+  };
+  return transformed;
+}
+// Applies the transpose of the upper-left 3x3 part of the matrix to p
+// (indices are swapped relative to transformVec4x3); no translation is
+// added.
+__forceinline__ __device__ float3
+transformVec4x3Transpose(const float3 &p, const float *matrix) {
+  float3 transformed = {
+      matrix[0] * p.x + matrix[1] * p.y + matrix[2] * p.z,
+      matrix[4] * p.x + matrix[5] * p.y + matrix[6] * p.z,
+      matrix[8] * p.x + matrix[9] * p.y + matrix[10] * p.z,
+  };
+  return transformed;
+}
+// Returns only the z component of the gradient computed by
+// dnormvdv(float3, float3): the same expression, scaled by 1 / |v|^3.
+// No guard against |v| == 0.
+__forceinline__ __device__ float dnormvdz(float3 v, float3 dv) {
+  float sum2 = v.x * v.x + v.y * v.y + v.z * v.z;
+  float invsum32 = 1.0f / sqrt(sum2 * sum2 * sum2);
+  float dnormvdz =
+      (-v.x * v.z * dv.x - v.y * v.z * dv.y + (sum2 - v.z * v.z) * dv.z) *
+      invsum32;
+  return dnormvdz;
+}
+// Backpropagates through the normalization n = v / |v|: given dv, the
+// gradient with respect to n, returns the gradient with respect to v.
+// Component i is sum_j dv_j * (delta_ij * |v|^2 - v_i * v_j) / |v|^3.
+// No guard against |v| == 0.
+__forceinline__ __device__ float3 dnormvdv(float3 v, float3 dv) {
+  float sum2 = v.x * v.x + v.y * v.y + v.z * v.z;
+  float invsum32 = 1.0f / sqrt(sum2 * sum2 * sum2);
+  float3 dnormvdv;
+  dnormvdv.x =
+      ((+sum2 - v.x * v.x) * dv.x - v.y * v.x * dv.y - v.z * v.x * dv.z) *
+      invsum32;
+  dnormvdv.y =
+      (-v.x * v.y * dv.x + (sum2 - v.y * v.y) * dv.y - v.z * v.y * dv.z) *
+      invsum32;
+  dnormvdv.z =
+      (-v.x * v.z * dv.x - v.y * v.z * dv.y + (sum2 - v.z * v.z) * dv.z) *
+      invsum32;
+  return dnormvdv;
+}
+// Four-component overload of dnormvdv: gradient through v / |v| for a
+// float4. vdv holds the element-wise products v_i * dv_i and vdv_sum their
+// total, so (vdv_sum - vdv.x) is the sum over the other three components.
+// No guard against |v| == 0.
+__forceinline__ __device__ float4 dnormvdv(float4 v, float4 dv) {
+  float sum2 = v.x * v.x + v.y * v.y + v.z * v.z + v.w * v.w;
+  float invsum32 = 1.0f / sqrt(sum2 * sum2 * sum2);
+  float4 vdv = {v.x * dv.x, v.y * dv.y, v.z * dv.z, v.w * dv.w};
+  float vdv_sum = vdv.x + vdv.y + vdv.z + vdv.w;
+  float4 dnormvdv;
+  dnormvdv.x = ((sum2 - v.x * v.x) * dv.x - v.x * (vdv_sum - vdv.x)) * invsum32;
+  dnormvdv.y = ((sum2 - v.y * v.y) * dv.y - v.y * (vdv_sum - vdv.y)) * invsum32;
+  dnormvdv.z = ((sum2 - v.z * v.z) * dv.z - v.z * (vdv_sum - vdv.z)) * invsum32;
+  dnormvdv.w = ((sum2 - v.w * v.w) * dv.w - v.w * (vdv_sum - vdv.w)) * invsum32;
+  return dnormvdv;
+}
+// Logistic function 1 / (1 + exp(-x)), evaluated in single precision.
+__forceinline__ __device__ float sigmoid(float x) {
+  return 1.0f / (1.0f + expf(-x));
+}
+// Visibility test for point idx of orig_points (3 floats per point).
+// Writes the view-space position to p_view and returns false when its z is
+// at most 0.2, true otherwise. The screen-space projection p_proj is
+// computed but not used by the active test (the x/y bounds check is
+// commented out). If a point is rejected while `prefiltered` is set, a
+// message is printed and the kernel is aborted via __trap().
+__forceinline__ __device__ bool in_frustum(int idx, const float *orig_points,
+                                           const float *viewmatrix,
+                                           const float *projmatrix,
+                                           bool prefiltered, float3 &p_view) {
+  float3 p_orig = {orig_points[3 * idx], orig_points[3 * idx + 1],
+                   orig_points[3 * idx + 2]};
+  // Bring points to screen space
+  float4 p_hom = transformPoint4x4(p_orig, projmatrix);
+  // Small epsilon avoids a division by zero when w is 0.
+  float p_w = 1.0f / (p_hom.w + 0.0000001f);
+  float3 p_proj = {p_hom.x * p_w, p_hom.y * p_w, p_hom.z * p_w};
+  p_view = transformPoint4x3(p_orig, viewmatrix);
+  if (p_view.z <= 0.2f) // || ((p_proj.x < -1.3 || p_proj.x > 1.3 || p_proj.y <
+                        // -1.3 || p_proj.y > 1.3)))
+  {
+    if (prefiltered) {
+      printf("Point is filtered although prefiltered is set. This shouldn't "
+             "happen!");
+      __trap();
+    }
+    return false;
+  }
+  return true;
+}
+// Executes statement A. When `debug` is true it then synchronizes the
+// device and, if an error is reported, prints it to std::cerr and throws
+// std::runtime_error. When `debug` is false no check is performed.
+// NOTE(review): the macro expands to several statements and is not wrapped
+// in do { } while (0), so it must not be used as the body of an unbraced
+// if/else. It also relies on the including file providing std::cerr and
+// std::runtime_error.
+#define CHECK_CUDA(A, debug)                                                   \
+  A;                                                                           \
+  if (debug) {                                                                 \
+    auto ret = cudaDeviceSynchronize();                                        \
+    if (ret != cudaSuccess) {                                                  \
+      std::cerr << "\n[CUDA ERROR] in " << __FILE__ << "\nLine " << __LINE__   \
+                << ": " << cudaGetErrorString(ret);                            \
+      throw std::runtime_error(cudaGetErrorString(ret));                       \
+    }                                                                          \
+  }
+#endif

gaussiancity/extensions/diff_gaussian_rasterization/cuda_rasterizer/backward.cu ADDED Viewed

	@@ -0,0 +1,622 @@

+/*
+ * Copyright (C) 2023, Inria
+ * GRAPHDECO research group, https://team.inria.fr/graphdeco
+ * All rights reserved.
+ *
+ * This software is free for non-commercial, research and evaluation use
+ * under the terms of the LICENSE.md file.
+ *
+ * For inquiries contact  george.drettakis@inria.fr
+ */
+#include "auxiliary.h"
+#include "backward.h"
+#include <cooperative_groups.h>
+#include <cooperative_groups/reduce.h>
+namespace cg = cooperative_groups;
+// Backward pass for conversion of spherical harmonics to RGB for
+// each Gaussian.
+//
+// idx        index of the Gaussian being processed
+// deg        active SH degree; higher-degree terms are skipped
+// max_coeffs number of vec3 coefficients stored per Gaussian (stride)
+// means      Gaussian centers; campos is the camera position
+// shs        SH coefficients, read as vec3 at offset idx * max_coeffs
+// clamped    3 flags per Gaussian; a set flag zeroes that channel's gradient
+// dL_dcolor  incoming gradient w.r.t. the RGB color of each Gaussian
+// dL_dmeans  output, accumulated (+=) with the view-direction contribution
+// dL_dshs    output, written at offset idx * max_coeffs
+__device__ void computeColorFromSH(int idx, int deg, int max_coeffs,
+                                   const glm::vec3 *means, glm::vec3 campos,
+                                   const float *shs, const bool *clamped,
+                                   const glm::vec3 *dL_dcolor,
+                                   glm::vec3 *dL_dmeans, glm::vec3 *dL_dshs) {
+  // Compute intermediate values, as it is done during forward
+  glm::vec3 pos = means[idx];
+  glm::vec3 dir_orig = pos - campos;
+  glm::vec3 dir = dir_orig / glm::length(dir_orig);
+  glm::vec3 *sh = ((glm::vec3 *)shs) + idx * max_coeffs;
+  // Use PyTorch rule for clamping: if clamping was applied,
+  // gradient becomes 0.
+  glm::vec3 dL_dRGB = dL_dcolor[idx];
+  dL_dRGB.x *= clamped[3 * idx + 0] ? 0 : 1;
+  dL_dRGB.y *= clamped[3 * idx + 1] ? 0 : 1;
+  dL_dRGB.z *= clamped[3 * idx + 2] ? 0 : 1;
+  // Derivatives of the RGB color w.r.t. the x, y, z components of the
+  // normalized view direction; accumulated degree by degree below.
+  glm::vec3 dRGBdx(0, 0, 0);
+  glm::vec3 dRGBdy(0, 0, 0);
+  glm::vec3 dRGBdz(0, 0, 0);
+  float x = dir.x;
+  float y = dir.y;
+  float z = dir.z;
+  // Target location for this Gaussian to write SH gradients to
+  glm::vec3 *dL_dsh = dL_dshs + idx * max_coeffs;
+  // No tricks here, just high school-level calculus.
+  float dRGBdsh0 = SH_C0;
+  dL_dsh[0] = dRGBdsh0 * dL_dRGB;
+  if (deg > 0) {
+    float dRGBdsh1 = -SH_C1 * y;
+    float dRGBdsh2 = SH_C1 * z;
+    float dRGBdsh3 = -SH_C1 * x;
+    dL_dsh[1] = dRGBdsh1 * dL_dRGB;
+    dL_dsh[2] = dRGBdsh2 * dL_dRGB;
+    dL_dsh[3] = dRGBdsh3 * dL_dRGB;
+    dRGBdx = -SH_C1 * sh[3];
+    dRGBdy = -SH_C1 * sh[1];
+    dRGBdz = SH_C1 * sh[2];
+    if (deg > 1) {
+      float xx = x * x, yy = y * y, zz = z * z;
+      float xy = x * y, yz = y * z, xz = x * z;
+      float dRGBdsh4 = SH_C2[0] * xy;
+      float dRGBdsh5 = SH_C2[1] * yz;
+      float dRGBdsh6 = SH_C2[2] * (2.f * zz - xx - yy);
+      float dRGBdsh7 = SH_C2[3] * xz;
+      float dRGBdsh8 = SH_C2[4] * (xx - yy);
+      dL_dsh[4] = dRGBdsh4 * dL_dRGB;
+      dL_dsh[5] = dRGBdsh5 * dL_dRGB;
+      dL_dsh[6] = dRGBdsh6 * dL_dRGB;
+      dL_dsh[7] = dRGBdsh7 * dL_dRGB;
+      dL_dsh[8] = dRGBdsh8 * dL_dRGB;
+      dRGBdx += SH_C2[0] * y * sh[4] + SH_C2[2] * 2.f * -x * sh[6] +
+                SH_C2[3] * z * sh[7] + SH_C2[4] * 2.f * x * sh[8];
+      dRGBdy += SH_C2[0] * x * sh[4] + SH_C2[1] * z * sh[5] +
+                SH_C2[2] * 2.f * -y * sh[6] + SH_C2[4] * 2.f * -y * sh[8];
+      dRGBdz += SH_C2[1] * y * sh[5] + SH_C2[2] * 2.f * 2.f * z * sh[6] +
+                SH_C2[3] * x * sh[7];
+      if (deg > 2) {
+        float dRGBdsh9 = SH_C3[0] * y * (3.f * xx - yy);
+        float dRGBdsh10 = SH_C3[1] * xy * z;
+        float dRGBdsh11 = SH_C3[2] * y * (4.f * zz - xx - yy);
+        float dRGBdsh12 = SH_C3[3] * z * (2.f * zz - 3.f * xx - 3.f * yy);
+        float dRGBdsh13 = SH_C3[4] * x * (4.f * zz - xx - yy);
+        float dRGBdsh14 = SH_C3[5] * z * (xx - yy);
+        float dRGBdsh15 = SH_C3[6] * x * (xx - 3.f * yy);
+        dL_dsh[9] = dRGBdsh9 * dL_dRGB;
+        dL_dsh[10] = dRGBdsh10 * dL_dRGB;
+        dL_dsh[11] = dRGBdsh11 * dL_dRGB;
+        dL_dsh[12] = dRGBdsh12 * dL_dRGB;
+        dL_dsh[13] = dRGBdsh13 * dL_dRGB;
+        dL_dsh[14] = dRGBdsh14 * dL_dRGB;
+        dL_dsh[15] = dRGBdsh15 * dL_dRGB;
+        dRGBdx += (SH_C3[0] * sh[9] * 3.f * 2.f * xy + SH_C3[1] * sh[10] * yz +
+                   SH_C3[2] * sh[11] * -2.f * xy +
+                   SH_C3[3] * sh[12] * -3.f * 2.f * xz +
+                   SH_C3[4] * sh[13] * (-3.f * xx + 4.f * zz - yy) +
+                   SH_C3[5] * sh[14] * 2.f * xz +
+                   SH_C3[6] * sh[15] * 3.f * (xx - yy));
+        dRGBdy +=
+            (SH_C3[0] * sh[9] * 3.f * (xx - yy) + SH_C3[1] * sh[10] * xz +
+             SH_C3[2] * sh[11] * (-3.f * yy + 4.f * zz - xx) +
+             SH_C3[3] * sh[12] * -3.f * 2.f * yz +
+             SH_C3[4] * sh[13] * -2.f * xy + SH_C3[5] * sh[14] * -2.f * yz +
+             SH_C3[6] * sh[15] * -3.f * 2.f * xy);
+        dRGBdz += (SH_C3[1] * sh[10] * xy + SH_C3[2] * sh[11] * 4.f * 2.f * yz +
+                   SH_C3[3] * sh[12] * 3.f * (2.f * zz - xx - yy) +
+                   SH_C3[4] * sh[13] * 4.f * 2.f * xz +
+                   SH_C3[5] * sh[14] * (xx - yy));
+      }
+    }
+  }
+  // The view direction is an input to the computation. View direction
+  // is influenced by the Gaussian's mean, so SHs gradients
+  // must propagate back into 3D position.
+  glm::vec3 dL_ddir(glm::dot(dRGBdx, dL_dRGB), glm::dot(dRGBdy, dL_dRGB),
+                    glm::dot(dRGBdz, dL_dRGB));
+  // Account for normalization of direction
+  float3 dL_dmean = dnormvdv(float3{dir_orig.x, dir_orig.y, dir_orig.z},
+                             float3{dL_ddir.x, dL_ddir.y, dL_ddir.z});
+  // Gradients of loss w.r.t. Gaussian means, but only the portion
+  // that is caused because the mean affects the view-dependent color.
+  // Additional mean gradient is accumulated in below methods.
+  dL_dmeans[idx] += glm::vec3(dL_dmean.x, dL_dmean.y, dL_dmean.z);
+}
+// Backward version of INVERSE 2D covariance matrix computation
+// (due to length launched as separate kernel before other
+// backward steps contained in preprocess)
+//
+// One thread per Gaussian; threads with idx >= P or a non-positive radius
+// return without writing anything.
+//
+// P            number of Gaussians
+// means        Gaussian centers
+// radii        per-Gaussian radius; only entries > 0 are processed
+// cov3Ds       6 floats per Gaussian (unique entries of the symmetric 3x3)
+// h_x, h_y     scale factors of the projection Jacobian (presumably the
+//              focal lengths in pixels -- confirm against the caller)
+// tan_fovx/y   tangents used to clamp t.x / t.z and t.y / t.z to +-1.3x
+// view_matrix  16 floats, indexed as in transformPoint4x3
+// dL_dconics   4 floats per Gaussian; entries 0, 1 and 3 are read
+// dL_dmeans    output, ASSIGNED (not accumulated) here
+// dL_dcov      output, 6 floats per Gaussian
+__global__ void computeCov2DCUDA(int P, const float3 *means, const int *radii,
+                                 const float *cov3Ds, const float h_x,
+                                 float h_y, const float tan_fovx,
+                                 float tan_fovy, const float *view_matrix,
+                                 const float *dL_dconics, float3 *dL_dmeans,
+                                 float *dL_dcov) {
+  auto idx = cg::this_grid().thread_rank();
+  if (idx >= P || !(radii[idx] > 0))
+    return;
+  // Reading location of 3D covariance for this Gaussian
+  const float *cov3D = cov3Ds + 6 * idx;
+  // Fetch gradients, recompute 2D covariance and relevant
+  // intermediate forward results needed in the backward.
+  float3 mean = means[idx];
+  float3 dL_dconic = {dL_dconics[4 * idx], dL_dconics[4 * idx + 1],
+                      dL_dconics[4 * idx + 3]};
+  float3 t = transformPoint4x3(mean, view_matrix);
+  const float limx = 1.3f * tan_fovx;
+  const float limy = 1.3f * tan_fovy;
+  const float txtz = t.x / t.z;
+  const float tytz = t.y / t.z;
+  t.x = min(limx, max(-limx, txtz)) * t.z;
+  t.y = min(limy, max(-limy, tytz)) * t.z;
+  // Where the clamp above was active the gradient w.r.t. t.x / t.y is zeroed.
+  const float x_grad_mul = txtz < -limx || txtz > limx ? 0 : 1;
+  const float y_grad_mul = tytz < -limy || tytz > limy ? 0 : 1;
+  glm::mat3 J = glm::mat3(h_x / t.z, 0.0f, -(h_x * t.x) / (t.z * t.z), 0.0f,
+                          h_y / t.z, -(h_y * t.y) / (t.z * t.z), 0, 0, 0);
+  glm::mat3 W = glm::mat3(view_matrix[0], view_matrix[4], view_matrix[8],
+                          view_matrix[1], view_matrix[5], view_matrix[9],
+                          view_matrix[2], view_matrix[6], view_matrix[10]);
+  glm::mat3 Vrk = glm::mat3(cov3D[0], cov3D[1], cov3D[2], cov3D[1], cov3D[3],
+                            cov3D[4], cov3D[2], cov3D[4], cov3D[5]);
+  glm::mat3 T = W * J;
+  glm::mat3 cov2D = glm::transpose(T) * glm::transpose(Vrk) * T;
+  // Use helper variables for 2D covariance entries. More compact.
+  // 0.3 is added to both diagonal entries before they are read.
+  float a = cov2D[0][0] += 0.3f;
+  float b = cov2D[0][1];
+  float c = cov2D[1][1] += 0.3f;
+  float denom = a * c - b * b;
+  float dL_da = 0, dL_db = 0, dL_dc = 0;
+  // Small epsilon keeps the division finite when the determinant is 0.
+  float denom2inv = 1.0f / ((denom * denom) + 0.0000001f);
+  if (denom2inv != 0) {
+    // Gradients of loss w.r.t. entries of 2D covariance matrix,
+    // given gradients of loss w.r.t. conic matrix (inverse covariance matrix).
+    // e.g., dL / da = dL / d_conic_a * d_conic_a / d_a
+    dL_da = denom2inv * (-c * c * dL_dconic.x + 2 * b * c * dL_dconic.y +
+                         (denom - a * c) * dL_dconic.z);
+    dL_dc = denom2inv * (-a * a * dL_dconic.z + 2 * a * b * dL_dconic.y +
+                         (denom - a * c) * dL_dconic.x);
+    dL_db = denom2inv * 2 *
+            (b * c * dL_dconic.x - (denom + 2 * b * b) * dL_dconic.y +
+             a * b * dL_dconic.z);
+    // Gradients of loss L w.r.t. each 3D covariance matrix (Vrk) entry,
+    // given gradients w.r.t. 2D covariance matrix (diagonal).
+    // cov2D = transpose(T) * transpose(Vrk) * T;
+    dL_dcov[6 * idx + 0] =
+        (T[0][0] * T[0][0] * dL_da + T[0][0] * T[1][0] * dL_db +
+         T[1][0] * T[1][0] * dL_dc);
+    dL_dcov[6 * idx + 3] =
+        (T[0][1] * T[0][1] * dL_da + T[0][1] * T[1][1] * dL_db +
+         T[1][1] * T[1][1] * dL_dc);
+    dL_dcov[6 * idx + 5] =
+        (T[0][2] * T[0][2] * dL_da + T[0][2] * T[1][2] * dL_db +
+         T[1][2] * T[1][2] * dL_dc);
+    // Gradients of loss L w.r.t. each 3D covariance matrix (Vrk) entry,
+    // given gradients w.r.t. 2D covariance matrix (off-diagonal).
+    // Off-diagonal elements appear twice --> double the gradient.
+    // cov2D = transpose(T) * transpose(Vrk) * T;
+    dL_dcov[6 * idx + 1] = 2 * T[0][0] * T[0][1] * dL_da +
+                           (T[0][0] * T[1][1] + T[0][1] * T[1][0]) * dL_db +
+                           2 * T[1][0] * T[1][1] * dL_dc;
+    dL_dcov[6 * idx + 2] = 2 * T[0][0] * T[0][2] * dL_da +
+                           (T[0][0] * T[1][2] + T[0][2] * T[1][0]) * dL_db +
+                           2 * T[1][0] * T[1][2] * dL_dc;
+    dL_dcov[6 * idx + 4] = 2 * T[0][2] * T[0][1] * dL_da +
+                           (T[0][1] * T[1][2] + T[0][2] * T[1][1]) * dL_db +
+                           2 * T[1][1] * T[1][2] * dL_dc;
+  } else {
+    for (int i = 0; i < 6; i++)
+      dL_dcov[6 * idx + i] = 0;
+  }
+  // Gradients of loss w.r.t. upper 2x3 portion of intermediate matrix T
+  // cov2D = transpose(T) * transpose(Vrk) * T;
+  float dL_dT00 =
+      2 * (T[0][0] * Vrk[0][0] + T[0][1] * Vrk[0][1] + T[0][2] * Vrk[0][2]) *
+          dL_da +
+      (T[1][0] * Vrk[0][0] + T[1][1] * Vrk[0][1] + T[1][2] * Vrk[0][2]) * dL_db;
+  float dL_dT01 =
+      2 * (T[0][0] * Vrk[1][0] + T[0][1] * Vrk[1][1] + T[0][2] * Vrk[1][2]) *
+          dL_da +
+      (T[1][0] * Vrk[1][0] + T[1][1] * Vrk[1][1] + T[1][2] * Vrk[1][2]) * dL_db;
+  float dL_dT02 =
+      2 * (T[0][0] * Vrk[2][0] + T[0][1] * Vrk[2][1] + T[0][2] * Vrk[2][2]) *
+          dL_da +
+      (T[1][0] * Vrk[2][0] + T[1][1] * Vrk[2][1] + T[1][2] * Vrk[2][2]) * dL_db;
+  float dL_dT10 =
+      2 * (T[1][0] * Vrk[0][0] + T[1][1] * Vrk[0][1] + T[1][2] * Vrk[0][2]) *
+          dL_dc +
+      (T[0][0] * Vrk[0][0] + T[0][1] * Vrk[0][1] + T[0][2] * Vrk[0][2]) * dL_db;
+  float dL_dT11 =
+      2 * (T[1][0] * Vrk[1][0] + T[1][1] * Vrk[1][1] + T[1][2] * Vrk[1][2]) *
+          dL_dc +
+      (T[0][0] * Vrk[1][0] + T[0][1] * Vrk[1][1] + T[0][2] * Vrk[1][2]) * dL_db;
+  float dL_dT12 =
+      2 * (T[1][0] * Vrk[2][0] + T[1][1] * Vrk[2][1] + T[1][2] * Vrk[2][2]) *
+          dL_dc +
+      (T[0][0] * Vrk[2][0] + T[0][1] * Vrk[2][1] + T[0][2] * Vrk[2][2]) * dL_db;
+  // Gradients of loss w.r.t. upper 3x2 non-zero entries of Jacobian matrix
+  // T = W * J
+  float dL_dJ00 = W[0][0] * dL_dT00 + W[0][1] * dL_dT01 + W[0][2] * dL_dT02;
+  float dL_dJ02 = W[2][0] * dL_dT00 + W[2][1] * dL_dT01 + W[2][2] * dL_dT02;
+  float dL_dJ11 = W[1][0] * dL_dT10 + W[1][1] * dL_dT11 + W[1][2] * dL_dT12;
+  float dL_dJ12 = W[2][0] * dL_dT10 + W[2][1] * dL_dT11 + W[2][2] * dL_dT12;
+  // Despite the names, tz / tz2 / tz3 hold 1 / t.z and its powers.
+  float tz = 1.f / t.z;
+  float tz2 = tz * tz;
+  float tz3 = tz2 * tz;
+  // Gradients of loss w.r.t. transformed Gaussian mean t
+  float dL_dtx = x_grad_mul * -h_x * tz2 * dL_dJ02;
+  float dL_dty = y_grad_mul * -h_y * tz2 * dL_dJ12;
+  float dL_dtz = -h_x * tz2 * dL_dJ00 - h_y * tz2 * dL_dJ11 +
+                 (2 * h_x * t.x) * tz3 * dL_dJ02 +
+                 (2 * h_y * t.y) * tz3 * dL_dJ12;
+  // Account for transformation of mean to t
+  // t = transformPoint4x3(mean, view_matrix);
+  float3 dL_dmean =
+      transformVec4x3Transpose({dL_dtx, dL_dty, dL_dtz}, view_matrix);
+  // Gradients of loss w.r.t. Gaussian means, but only the portion
+  // that is caused because the mean affects the covariance matrix.
+  // Additional mean gradient is accumulated in BACKWARD::preprocess.
+  dL_dmeans[idx] = dL_dmean;
+}
+// Backward pass for the conversion of scale and rotation to a
+// 3D covariance matrix for each Gaussian.
+//
+// idx         index of the Gaussian being processed
+// scale, mod  per-axis scale and the global multiplier applied to it
+// rot         quaternion read as (r, x, y, z) = (rot.x, rot.y, rot.z, rot.w);
+//             it is used as given, without normalization
+// dL_dcov3Ds  incoming gradients, 6 floats per Gaussian
+// dL_dscales  output, written at index idx
+// dL_drots    output, written at index idx
+__device__ void computeCov3D(int idx, const glm::vec3 scale, float mod,
+                             const glm::vec4 rot, const float *dL_dcov3Ds,
+                             glm::vec3 *dL_dscales, glm::vec4 *dL_drots) {
+  // Recompute (intermediate) results for the 3D covariance computation.
+  glm::vec4 q = rot; // / glm::length(rot);
+  float r = q.x;
+  float x = q.y;
+  float y = q.z;
+  float z = q.w;
+  glm::mat3 R = glm::mat3(1.f - 2.f * (y * y + z * z), 2.f * (x * y - r * z),
+                          2.f * (x * z + r * y), 2.f * (x * y + r * z),
+                          1.f - 2.f * (x * x + z * z), 2.f * (y * z - r * x),
+                          2.f * (x * z - r * y), 2.f * (y * z + r * x),
+                          1.f - 2.f * (x * x + y * y));
+  glm::mat3 S = glm::mat3(1.0f);
+  glm::vec3 s = mod * scale;
+  S[0][0] = s.x;
+  S[1][1] = s.y;
+  S[2][2] = s.z;
+  glm::mat3 M = S * R;
+  const float *dL_dcov3D = dL_dcov3Ds + 6 * idx;
+  // NOTE(review): dunc and ounc are not referenced again in this function.
+  glm::vec3 dunc(dL_dcov3D[0], dL_dcov3D[3], dL_dcov3D[5]);
+  glm::vec3 ounc = 0.5f * glm::vec3(dL_dcov3D[1], dL_dcov3D[2], dL_dcov3D[4]);
+  // Convert per-element covariance loss gradients to matrix form
+  // (off-diagonal gradients are halved and placed symmetrically).
+  glm::mat3 dL_dSigma =
+      glm::mat3(dL_dcov3D[0], 0.5f * dL_dcov3D[1], 0.5f * dL_dcov3D[2],
+                0.5f * dL_dcov3D[1], dL_dcov3D[3], 0.5f * dL_dcov3D[4],
+                0.5f * dL_dcov3D[2], 0.5f * dL_dcov3D[4], dL_dcov3D[5]);
+  // Compute loss gradient w.r.t. matrix M
+  // dSigma_dM = 2 * M
+  glm::mat3 dL_dM = 2.0f * M * dL_dSigma;
+  glm::mat3 Rt = glm::transpose(R);
+  glm::mat3 dL_dMt = glm::transpose(dL_dM);
+  // Gradients of loss w.r.t. scale
+  glm::vec3 *dL_dscale = dL_dscales + idx;
+  dL_dscale->x = glm::dot(Rt[0], dL_dMt[0]);
+  dL_dscale->y = glm::dot(Rt[1], dL_dMt[1]);
+  dL_dscale->z = glm::dot(Rt[2], dL_dMt[2]);
+  // Scale the columns in place; must happen after the scale gradients
+  // above, which read the unscaled values.
+  dL_dMt[0] *= s.x;
+  dL_dMt[1] *= s.y;
+  dL_dMt[2] *= s.z;
+  // Gradients of loss w.r.t. normalized quaternion
+  glm::vec4 dL_dq;
+  dL_dq.x = 2 * z * (dL_dMt[0][1] - dL_dMt[1][0]) +
+            2 * y * (dL_dMt[2][0] - dL_dMt[0][2]) +
+            2 * x * (dL_dMt[1][2] - dL_dMt[2][1]);
+  dL_dq.y = 2 * y * (dL_dMt[1][0] + dL_dMt[0][1]) +
+            2 * z * (dL_dMt[2][0] + dL_dMt[0][2]) +
+            2 * r * (dL_dMt[1][2] - dL_dMt[2][1]) -
+            4 * x * (dL_dMt[2][2] + dL_dMt[1][1]);
+  dL_dq.z = 2 * x * (dL_dMt[1][0] + dL_dMt[0][1]) +
+            2 * r * (dL_dMt[2][0] - dL_dMt[0][2]) +
+            2 * z * (dL_dMt[1][2] + dL_dMt[2][1]) -
+            4 * y * (dL_dMt[2][2] + dL_dMt[0][0]);
+  dL_dq.w = 2 * r * (dL_dMt[0][1] - dL_dMt[1][0]) +
+            2 * x * (dL_dMt[2][0] + dL_dMt[0][2]) +
+            2 * y * (dL_dMt[1][2] + dL_dMt[2][1]) -
+            4 * z * (dL_dMt[1][1] + dL_dMt[0][0]);
+  // Gradients of loss w.r.t. unnormalized quaternion
+  // dL_dq is stored as-is; the normalization backward (dnormvdv) is
+  // commented out, matching the unnormalized q used above.
+  float4 *dL_drot = (float4 *)(dL_drots + idx);
+  *dL_drot = float4{dL_dq.x, dL_dq.y, dL_dq.z,
+                    dL_dq.w}; // dnormvdv(float4{ rot.x, rot.y, rot.z, rot.w },
+                              // float4{ dL_dq.x, dL_dq.y, dL_dq.z, dL_dq.w });
+}
+// Backward pass of the preprocessing steps, except
+// for the covariance computation and inversion
+// (those are handled by a previous kernel call)
+//
+// One thread per Gaussian; threads with idx >= P or a non-positive radius
+// return immediately. The template parameter C is not referenced in the
+// body.
+//
+// P, D, M      number of Gaussians, SH degree, SH coefficients per Gaussian
+// proj         16 floats, indexed as in transformPoint4x4
+// dL_dmean2D   incoming gradient w.r.t. the 2D means (.x and .y are read)
+// dL_dmeans    accumulated (+=); expected to already hold the contribution
+//              from the covariance kernel
+// shs, scales  optional: the SH and scale/rotation backward steps run only
+//              when the respective pointer is non-null
+template <int C>
+__global__ void
+preprocessCUDA(int P, int D, int M, const float3 *means, const int *radii,
+               const float *shs, const bool *clamped, const glm::vec3 *scales,
+               const glm::vec4 *rotations, const float scale_modifier,
+               const float *proj, const glm::vec3 *campos,
+               const float3 *dL_dmean2D, glm::vec3 *dL_dmeans, float *dL_dcolor,
+               float *dL_dcov3D, float *dL_dsh, glm::vec3 *dL_dscale,
+               glm::vec4 *dL_drot) {
+  auto idx = cg::this_grid().thread_rank();
+  if (idx >= P || !(radii[idx] > 0))
+    return;
+  float3 m = means[idx];
+  // Taking care of gradients from the screenspace points
+  float4 m_hom = transformPoint4x4(m, proj);
+  // Small epsilon avoids a division by zero when w is 0.
+  float m_w = 1.0f / (m_hom.w + 0.0000001f);
+  // Compute loss gradient w.r.t. 3D means due to gradients of 2D means
+  // from rendering procedure
+  glm::vec3 dL_dmean;
+  // mul1 / mul2: projected x / y (before the perspective divide) times 1/w^2.
+  float mul1 =
+      (proj[0] * m.x + proj[4] * m.y + proj[8] * m.z + proj[12]) * m_w * m_w;
+  float mul2 =
+      (proj[1] * m.x + proj[5] * m.y + proj[9] * m.z + proj[13]) * m_w * m_w;
+  dL_dmean.x = (proj[0] * m_w - proj[3] * mul1) * dL_dmean2D[idx].x +
+               (proj[1] * m_w - proj[3] * mul2) * dL_dmean2D[idx].y;
+  dL_dmean.y = (proj[4] * m_w - proj[7] * mul1) * dL_dmean2D[idx].x +
+               (proj[5] * m_w - proj[7] * mul2) * dL_dmean2D[idx].y;
+  dL_dmean.z = (proj[8] * m_w - proj[11] * mul1) * dL_dmean2D[idx].x +
+               (proj[9] * m_w - proj[11] * mul2) * dL_dmean2D[idx].y;
+  // That's the second part of the mean gradient. Previous computation
+  // of cov2D and following SH conversion also affects it.
+  dL_dmeans[idx] += dL_dmean;
+  // Compute gradient updates due to computing colors from SHs
+  if (shs)
+    computeColorFromSH(idx, D, M, (glm::vec3 *)means, *campos, shs, clamped,
+                       (glm::vec3 *)dL_dcolor, (glm::vec3 *)dL_dmeans,
+                       (glm::vec3 *)dL_dsh);
+  // Compute gradient updates due to computing covariance from scale/rotation
+  if (scales)
+    computeCov3D(idx, scales[idx], scale_modifier, rotations[idx], dL_dcov3D,
+                 dL_dscale, dL_drot);
+}
+// Backward version of the rendering procedure. Each thread block re-walks
+// the Gaussians of one tile from back to front, each thread handling one
+// pixel, and accumulates gradients w.r.t. the per-Gaussian 2D mean, conic,
+// opacity and color with atomic adds.
+template <uint32_t C>
+__global__ void __launch_bounds__(BLOCK_X *BLOCK_Y) renderCUDA(
+    const uint2 *__restrict__ ranges, const uint32_t *__restrict__ point_list,
+    int W, int H, const float *__restrict__ bg_color,
+    const float2 *__restrict__ points_xy_image,
+    const float4 *__restrict__ conic_opacity, const float *__restrict__ colors,
+    const float *__restrict__ final_Ts, const uint32_t *__restrict__ n_contrib,
+    const float *__restrict__ dL_dpixels, float3 *__restrict__ dL_dmean2D,
+    float4 *__restrict__ dL_dconic2D, float *__restrict__ dL_dopacity,
+    float *__restrict__ dL_dcolors) {
+  // We rasterize again. Compute necessary block info.
+  auto block = cg::this_thread_block();
+  const uint32_t horizontal_blocks = (W + BLOCK_X - 1) / BLOCK_X;
+  const uint2 pix_min = {block.group_index().x * BLOCK_X,
+                         block.group_index().y * BLOCK_Y};
+  const uint2 pix = {pix_min.x + block.thread_index().x,
+                     pix_min.y + block.thread_index().y};
+  const uint32_t pix_id = W * pix.y + pix.x;
+  const float2 pixf = {(float)pix.x, (float)pix.y};
+  // Threads whose pixel lies outside the image still take part in the
+  // shared-memory loads below, but never enter the blending loop.
+  const bool inside = pix.x < W && pix.y < H;
+  const uint2 range =
+      ranges[block.group_index().y * horizontal_blocks + block.group_index().x];
+  const int rounds = ((range.y - range.x + BLOCK_SIZE - 1) / BLOCK_SIZE);
+  bool done = !inside;
+  int toDo = range.y - range.x;
+  __shared__ int collected_id[BLOCK_SIZE];
+  __shared__ float2 collected_xy[BLOCK_SIZE];
+  __shared__ float4 collected_conic_opacity[BLOCK_SIZE];
+  __shared__ float collected_colors[C * BLOCK_SIZE];
+  // In the forward, we stored the final value for T, the
+  // product of all (1 - alpha) factors.
+  const float T_final = inside ? final_Ts[pix_id] : 0;
+  float T = T_final;
+  // We start from the back. The ID of the last contributing
+  // Gaussian is known from each pixel from the forward.
+  uint32_t contributor = toDo;
+  const int last_contributor = inside ? n_contrib[pix_id] : 0;
+  float accum_rec[C] = {0};
+  // Incoming per-channel pixel gradient and its dot product with the
+  // background color. Both depend only on the pixel, so they are computed
+  // once here instead of once per Gaussian. Zero-initialized so that
+  // out-of-image threads never hold indeterminate values.
+  float dL_dpixel[C] = {0};
+  float bg_dot_dpixel = 0;
+  if (inside) {
+    for (int ch = 0; ch < C; ch++) {
+      dL_dpixel[ch] = dL_dpixels[ch * H * W + pix_id];
+      bg_dot_dpixel += bg_color[ch] * dL_dpixel[ch];
+    }
+  }
+  float last_alpha = 0;
+  float last_color[C] = {0};
+  // Gradient of pixel coordinate w.r.t. normalized
+  // screen-space viewport coordinates (-1 to 1)
+  const float ddelx_dx = 0.5 * W;
+  const float ddely_dy = 0.5 * H;
+  // Traverse all Gaussians
+  for (int i = 0; i < rounds; i++, toDo -= BLOCK_SIZE) {
+    // Load auxiliary data into shared memory, start in the BACK
+    // and load them in reverse order.
+    block.sync();
+    const int progress = i * BLOCK_SIZE + block.thread_rank();
+    if (range.x + progress < range.y) {
+      const int coll_id = point_list[range.y - progress - 1];
+      collected_id[block.thread_rank()] = coll_id;
+      collected_xy[block.thread_rank()] = points_xy_image[coll_id];
+      collected_conic_opacity[block.thread_rank()] = conic_opacity[coll_id];
+      // Channel index gets its own name so it does not shadow the round
+      // counter i of the enclosing loop.
+      for (int ch = 0; ch < C; ch++)
+        collected_colors[ch * BLOCK_SIZE + block.thread_rank()] =
+            colors[coll_id * C + ch];
+    }
+    block.sync();
+    // Iterate over Gaussians
+    for (int j = 0; !done && j < min(BLOCK_SIZE, toDo); j++) {
+      // Keep track of current Gaussian ID. Skip, if this one
+      // is behind the last contributor for this pixel.
+      contributor--;
+      if (contributor >= last_contributor)
+        continue;
+      // Compute blending values, as before.
+      const float2 xy = collected_xy[j];
+      const float2 d = {xy.x - pixf.x, xy.y - pixf.y};
+      const float4 con_o = collected_conic_opacity[j];
+      const float power = -0.5f * (con_o.x * d.x * d.x + con_o.z * d.y * d.y) -
+                          con_o.y * d.x * d.y;
+      if (power > 0.0f)
+        continue;
+      const float G = exp(power);
+      const float alpha = min(0.99f, con_o.w * G);
+      if (alpha < 1.0f / 255.0f)
+        continue;
+      // Undo this Gaussian's (1 - alpha) factor to recover the
+      // transmittance in front of it.
+      T = T / (1.f - alpha);
+      const float dchannel_dcolor = alpha * T;
+      // Propagate gradients to per-Gaussian colors and keep
+      // gradients w.r.t. alpha (blending factor for a Gaussian/pixel
+      // pair).
+      float dL_dalpha = 0.0f;
+      const int global_id = collected_id[j];
+      for (int ch = 0; ch < C; ch++) {
+        const float c = collected_colors[ch * BLOCK_SIZE + j];
+        // Update last color (to be used in the next iteration)
+        accum_rec[ch] =
+            last_alpha * last_color[ch] + (1.f - last_alpha) * accum_rec[ch];
+        last_color[ch] = c;
+        const float dL_dchannel = dL_dpixel[ch];
+        dL_dalpha += (c - accum_rec[ch]) * dL_dchannel;
+        // Update the gradients w.r.t. color of the Gaussian.
+        // Atomic, since this pixel is just one of potentially
+        // many that were affected by this Gaussian.
+        atomicAdd(&(dL_dcolors[global_id * C + ch]),
+                  dchannel_dcolor * dL_dchannel);
+      }
+      dL_dalpha *= T;
+      // Update last alpha (to be used in the next iteration)
+      last_alpha = alpha;
+      // Account for fact that alpha also influences how much of
+      // the background color is added if nothing left to blend
+      dL_dalpha += (-T_final / (1.f - alpha)) * bg_dot_dpixel;
+      // Helpful reusable temporary variables
+      const float dL_dG = con_o.w * dL_dalpha;
+      const float gdx = G * d.x;
+      const float gdy = G * d.y;
+      const float dG_ddelx = -gdx * con_o.x - gdy * con_o.y;
+      const float dG_ddely = -gdy * con_o.z - gdx * con_o.y;
+      // Update gradients w.r.t. 2D mean position of the Gaussian
+      atomicAdd(&dL_dmean2D[global_id].x, dL_dG * dG_ddelx * ddelx_dx);
+      atomicAdd(&dL_dmean2D[global_id].y, dL_dG * dG_ddely * ddely_dy);
+      // Update gradients w.r.t. 2D covariance (2x2 matrix, symmetric)
+      atomicAdd(&dL_dconic2D[global_id].x, -0.5f * gdx * d.x * dL_dG);
+      atomicAdd(&dL_dconic2D[global_id].y, -0.5f * gdx * d.y * dL_dG);
+      atomicAdd(&dL_dconic2D[global_id].w, -0.5f * gdy * d.y * dL_dG);
+      // Update gradients w.r.t. opacity of the Gaussian
+      atomicAdd(&(dL_dopacity[global_id]), G * dL_dalpha);
+    }
+  }
+}
+// Host-side driver for the backward pass of preprocessing. Launches
+// computeCov2DCUDA and then preprocessCUDA, each with one thread per
+// Gaussian in blocks of 256 threads.
+void BACKWARD::preprocess(
+    int P, int D, int M, const float3 *means3D, const int *radii,
+    const float *shs, const bool *clamped, const glm::vec3 *scales,
+    const glm::vec4 *rotations, const float scale_modifier, const float *cov3Ds,
+    const float *viewmatrix, const float *projmatrix, const float focal_x,
+    float focal_y, const float tan_fovx, float tan_fovy,
+    const glm::vec3 *campos, const float3 *dL_dmean2D, const float *dL_dconic,
+    glm::vec3 *dL_dmean3D, float *dL_dcolor, float *dL_dcov3D, float *dL_dsh,
+    glm::vec3 *dL_dscale, glm::vec4 *dL_drot) {
+  const int threads_per_block = 256;
+  const int num_blocks = (P + 255) / 256;
+  // Stage 1: gradients along the 2D conic computation path. It is long
+  // enough to live in its own kernel. Afterwards the 3D mean gradient has
+  // been updated and the 3D covariance gradient has been produced.
+  computeCov2DCUDA<<<num_blocks, threads_per_block>>>(
+      P, means3D, radii, cov3Ds, focal_x, focal_y, tan_fovx, tan_fovy,
+      viewmatrix, dL_dconic, (float3 *)dL_dmean3D, dL_dcov3D);
+  // Stage 2: everything else. Finish the 3D mean gradients, push color
+  // gradients to the SH coefficients (if desired) and push the 3D
+  // covariance gradients to scale and rotation.
+  preprocessCUDA<NUM_CHANNELS><<<num_blocks, threads_per_block>>>(
+      P, D, M, (float3 *)means3D, radii, shs, clamped, (glm::vec3 *)scales,
+      (glm::vec4 *)rotations, scale_modifier, projmatrix, campos,
+      (float3 *)dL_dmean2D, (glm::vec3 *)dL_dmean3D, dL_dcolor, dL_dcov3D,
+      dL_dsh, dL_dscale, dL_drot);
+}
+// Host-side launcher for the backward render kernel. Forwards every
+// argument unchanged to renderCUDA<NUM_CHANNELS> using the caller-supplied
+// grid / block launch configuration.
+void BACKWARD::render(const dim3 grid, const dim3 block, const uint2 *ranges,
+                      const uint32_t *point_list, int W, int H,
+                      const float *bg_color, const float2 *means2D,
+                      const float4 *conic_opacity, const float *colors,
+                      const float *final_Ts, const uint32_t *n_contrib,
+                      const float *dL_dpixels, float3 *dL_dmean2D,
+                      float4 *dL_dconic2D, float *dL_dopacity,
+                      float *dL_dcolors) {
+  renderCUDA<NUM_CHANNELS><<<grid, block>>>(
+      // Per-tile ranges into the point list, and image size
+      ranges, point_list, W, H,
+      // Values produced by the forward pass
+      bg_color, means2D, conic_opacity, colors, final_Ts, n_contrib,
+      // Incoming pixel gradient, then the gradient buffers to fill
+      dL_dpixels, dL_dmean2D, dL_dconic2D, dL_dopacity, dL_dcolors);
+}

gaussiancity/extensions/diff_gaussian_rasterization/cuda_rasterizer/backward.h ADDED Viewed

	@@ -0,0 +1,41 @@

+/*
+ * Copyright (C) 2023, Inria
+ * GRAPHDECO research group, https://team.inria.fr/graphdeco
+ * All rights reserved.
+ *
+ * This software is free for non-commercial, research and evaluation use
+ * under the terms of the LICENSE.md file.
+ *
+ * For inquiries contact  george.drettakis@inria.fr
+ */
+#ifndef CUDA_RASTERIZER_BACKWARD_H_INCLUDED
+#define CUDA_RASTERIZER_BACKWARD_H_INCLUDED
+#include "cuda_runtime.h"
+#include "device_launch_parameters.h"
+#include <cuda.h>
+#define GLM_FORCE_CUDA
+#include <glm/glm.hpp>
+namespace BACKWARD { // Host-side launchers for the backward (gradient) kernels
+void render(const dim3 grid, dim3 block, const uint2 *ranges, // Launches the backward render kernel with the given grid / block configuration
+            const uint32_t *point_list, int W, int H, const float *bg_color,
+            const float2 *means2D, const float4 *conic_opacity,
+            const float *colors, const float *final_Ts, // final_Ts / n_contrib: per-pixel values stored by the forward render
+            const uint32_t *n_contrib, const float *dL_dpixels, // dL_dpixels: incoming per-pixel gradient
+            float3 *dL_dmean2D, float4 *dL_dconic2D, float *dL_dopacity, // gradient buffers the kernel accumulates into
+            float *dL_dcolors);
+void preprocess(int P, int D, int M, const float3 *means, const int *radii, // Launches the cov2D and preprocessing backward kernels over P Gaussians
+                const float *shs, const bool *clamped, const glm::vec3 *scales, // shs / scales may be null; the kernel then skips the SH / scale-rotation gradients
+                const glm::vec4 *rotations, const float scale_modifier,
+                const float *cov3Ds, const float *view, const float *proj,
+                const float focal_x, float focal_y, const float tan_fovx,
+                float tan_fovy, const glm::vec3 *campos,
+                const float3 *dL_dmean2D, const float *dL_dconics, // inputs: gradients w.r.t. 2D means and conics
+                glm::vec3 *dL_dmeans, float *dL_dcolor, float *dL_dcov3D, // remaining dL_* pointers are passed on to the kernels as gradient buffers
+                float *dL_dsh, glm::vec3 *dL_dscale, glm::vec4 *dL_drot);
+} // namespace BACKWARD
+#endif

gaussiancity/extensions/diff_gaussian_rasterization/cuda_rasterizer/config.h ADDED Viewed

	@@ -0,0 +1,19 @@

+/*
+ * Copyright (C) 2023, Inria
+ * GRAPHDECO research group, https://team.inria.fr/graphdeco
+ * All rights reserved.
+ *
+ * This software is free for non-commercial, research and evaluation use
+ * under the terms of the LICENSE.md file.
+ *
+ * For inquiries contact  george.drettakis@inria.fr
+ */
+#ifndef CUDA_RASTERIZER_CONFIG_H_INCLUDED
+#define CUDA_RASTERIZER_CONFIG_H_INCLUDED
+#define NUM_CHANNELS 3 // Default 3, RGB; channel count the preprocess / render kernels are instantiated with
+#define BLOCK_X 16 // Tile width in pixels used by the render kernels
+#define BLOCK_Y 16 // Tile height in pixels; BLOCK_X * BLOCK_Y is the render kernels' launch bound
+#endif

gaussiancity/extensions/diff_gaussian_rasterization/cuda_rasterizer/forward.cu ADDED Viewed

	@@ -0,0 +1,376 @@

+/*
+ * Copyright (C) 2023, Inria
+ * GRAPHDECO research group, https://team.inria.fr/graphdeco
+ * All rights reserved.
+ *
+ * This software is free for non-commercial, research and evaluation use
+ * under the terms of the LICENSE.md file.
+ *
+ * For inquiries contact  george.drettakis@inria.fr
+ */
+#include "auxiliary.h"
+#include "forward.h"
+#include <cooperative_groups.h>
+#include <cooperative_groups/reduce.h>
+namespace cg = cooperative_groups;
+// Forward method for converting the input spherical harmonics
+// coefficients of each Gaussian to a simple RGB color. Evaluates bands 0..deg (deg up to 3) in the view direction, adds 0.5, clamps at zero and records which channels were clamped.
+__device__ glm::vec3 computeColorFromSH(int idx, int deg, int max_coeffs,
+                                        const glm::vec3 *means,
+                                        glm::vec3 campos, const float *shs,
+                                        bool *clamped) {
+  // The implementation is loosely based on code for
+  // "Differentiable Point-Based Radiance Fields for
+  // Efficient View Synthesis" by Zhang et al. (2022)
+  glm::vec3 pos = means[idx];
+  glm::vec3 dir = pos - campos; // direction from the camera position to the Gaussian mean
+  dir = dir / glm::length(dir); // NOTE(review): no guard for pos == campos (zero-length dir)
+  glm::vec3 *sh = ((glm::vec3 *)shs) + idx * max_coeffs; // this Gaussian's coefficients, viewed as max_coeffs RGB triples
+  glm::vec3 result = SH_C0 * sh[0]; // band 0: direction-independent term
+  if (deg > 0) {
+    float x = dir.x;
+    float y = dir.y;
+    float z = dir.z;
+    result = result - SH_C1 * y * sh[1] + SH_C1 * z * sh[2] - SH_C1 * x * sh[3]; // band 1 uses sh[1..3]
+    if (deg > 1) {
+      float xx = x * x, yy = y * y, zz = z * z;
+      float xy = x * y, yz = y * z, xz = x * z;
+      result = result + SH_C2[0] * xy * sh[4] + SH_C2[1] * yz * sh[5] + // band 2 uses sh[4..8]
+               SH_C2[2] * (2.0f * zz - xx - yy) * sh[6] +
+               SH_C2[3] * xz * sh[7] + SH_C2[4] * (xx - yy) * sh[8];
+      if (deg > 2) {
+        result = result + SH_C3[0] * y * (3.0f * xx - yy) * sh[9] + // band 3 uses sh[9..15]
+                 SH_C3[1] * xy * z * sh[10] +
+                 SH_C3[2] * y * (4.0f * zz - xx - yy) * sh[11] +
+                 SH_C3[3] * z * (2.0f * zz - 3.0f * xx - 3.0f * yy) * sh[12] +
+                 SH_C3[4] * x * (4.0f * zz - xx - yy) * sh[13] +
+                 SH_C3[5] * z * (xx - yy) * sh[14] +
+                 SH_C3[6] * x * (xx - 3.0f * yy) * sh[15];
+      }
+    }
+  }
+  result += 0.5f; // constant offset applied to every channel before clamping
+  // RGB colors are clamped to positive values. If values are
+  // clamped, we need to keep track of this for the backward pass.
+  clamped[3 * idx + 0] = (result.x < 0);
+  clamped[3 * idx + 1] = (result.y < 0);
+  clamped[3 * idx + 2] = (result.z < 0);
+  return glm::max(result, 0.0f);
+}
+// Forward pass: projects a Gaussian's 3D covariance into screen space and
+// returns the three unique entries of the resulting symmetric 2x2 matrix
+// as (cov[0][0], cov[0][1], cov[1][1]).
+__device__ float3 computeCov2D(const float3 &mean, float focal_x, float focal_y,
+                               float tan_fovx, float tan_fovy,
+                               const float *cov3D, const float *viewmatrix) {
+  // Models equations 29 and 31 of "EWA Splatting" (Zwicker et al., 2002),
+  // additionally accounting for viewport aspect / scaling. The transposes
+  // below reconcile row-/column-major conventions.
+  float3 cam = transformPoint4x3(mean, viewmatrix);
+  // Clamp x/z and y/z to 1.3 times the field-of-view tangents, then scale
+  // back by z.
+  const float bound_x = 1.3f * tan_fovx;
+  const float bound_y = 1.3f * tan_fovy;
+  const float ratio_x = cam.x / cam.z;
+  const float ratio_y = cam.y / cam.z;
+  cam.x = min(bound_x, max(-bound_x, ratio_x)) * cam.z;
+  cam.y = min(bound_y, max(-bound_y, ratio_y)) * cam.z;
+  // Jacobian of the projection at the clamped point; last three entries
+  // stay zero.
+  const glm::mat3 jacobian(
+      focal_x / cam.z, 0.0f, -(focal_x * cam.x) / (cam.z * cam.z), 0.0f,
+      focal_y / cam.z, -(focal_y * cam.y) / (cam.z * cam.z), 0, 0, 0);
+  // Upper-left 3x3 of the view matrix, passed to glm in transposed order.
+  const glm::mat3 view_rot(viewmatrix[0], viewmatrix[4], viewmatrix[8],
+                           viewmatrix[1], viewmatrix[5], viewmatrix[9],
+                           viewmatrix[2], viewmatrix[6], viewmatrix[10]);
+  const glm::mat3 transform = view_rot * jacobian;
+  // Expand the six stored covariance entries into a full symmetric matrix.
+  const glm::mat3 sigma3d(cov3D[0], cov3D[1], cov3D[2], cov3D[1], cov3D[3],
+                          cov3D[4], cov3D[2], cov3D[4], cov3D[5]);
+  glm::mat3 sigma2d =
+      glm::transpose(transform) * glm::transpose(sigma3d) * transform;
+  // Low-pass filter: keep every Gaussian at least about one pixel
+  // wide/high. The third row and column are discarded on return.
+  sigma2d[0][0] += 0.3f;
+  sigma2d[1][1] += 0.3f;
+  return {float(sigma2d[0][0]), float(sigma2d[0][1]), float(sigma2d[1][1])};
+}
+// Forward pass: builds the world-space 3D covariance of one Gaussian from
+// its scale and rotation quaternion and stores the six unique entries of
+// the symmetric result in cov3D. The quaternion is used exactly as passed
+// in; it is NOT normalized here, so callers presumably supply a unit
+// quaternion (TODO confirm against the caller).
+__device__ void computeCov3D(const glm::vec3 scale, float mod,
+                             const glm::vec4 rot, float *cov3D) {
+  // Diagonal scaling matrix with the global modifier applied.
+  const glm::mat3 scaling(mod * scale.x, 0.0f, 0.0f,
+                          0.0f, mod * scale.y, 0.0f,
+                          0.0f, 0.0f, mod * scale.z);
+  // Quaternion components: rot.x is the scalar part, rot.y/z/w the vector
+  // part.
+  const float r = rot.x;
+  const float x = rot.y;
+  const float y = rot.z;
+  const float z = rot.w;
+  // Rotation matrix from the quaternion.
+  const glm::mat3 rotation(
+      1.f - 2.f * (y * y + z * z), 2.f * (x * y - r * z),
+      2.f * (x * z + r * y), 2.f * (x * y + r * z),
+      1.f - 2.f * (x * x + z * z), 2.f * (y * z - r * x),
+      2.f * (x * z - r * y), 2.f * (y * z + r * x),
+      1.f - 2.f * (x * x + y * y));
+  const glm::mat3 scaled_rot = scaling * rotation;
+  // 3D world covariance matrix Sigma.
+  const glm::mat3 sigma = glm::transpose(scaled_rot) * scaled_rot;
+  // Sigma is symmetric, so only the upper-right triangle is written out.
+  cov3D[0] = sigma[0][0];
+  cov3D[1] = sigma[0][1];
+  cov3D[2] = sigma[0][2];
+  cov3D[3] = sigma[1][1];
+  cov3D[4] = sigma[1][2];
+  cov3D[5] = sigma[2][2];
+}
+// Perform initial steps for each Gaussian prior to rasterization: frustum test, projection, 3D/2D covariance, conic, screen-space extent and (optionally) SH-to-RGB color. One thread per Gaussian.
+template <int C>
+__global__ void
+preprocessCUDA(int P, int D, int M, const float *orig_points,
+               const glm::vec3 *scales, const float scale_modifier,
+               const glm::vec4 *rotations, const float *opacities,
+               const float *shs, bool *clamped, const float *cov3D_precomp,
+               const float *colors_precomp, const float *viewmatrix,
+               const float *projmatrix, const glm::vec3 *cam_pos, const int W,
+               int H, const float tan_fovx, float tan_fovy, const float focal_x,
+               float focal_y, int *radii, float2 *points_xy_image,
+               float *depths, float *cov3Ds, float *rgb, float4 *conic_opacity,
+               const dim3 grid, uint32_t *tiles_touched, bool prefiltered) {
+  auto idx = cg::this_grid().thread_rank();
+  if (idx >= P) // surplus threads of the last block have no Gaussian
+    return;
+  // Initialize radius and touched tiles to 0. If this isn't changed,
+  // this Gaussian will not be processed further.
+  radii[idx] = 0;
+  tiles_touched[idx] = 0;
+  // Perform near culling, quit if outside.
+  float3 p_view; // filled in by in_frustum; its z is stored as the depth below
+  if (!in_frustum(idx, orig_points, viewmatrix, projmatrix, prefiltered,
+                  p_view))
+    return;
+  // Transform point by projecting
+  float3 p_orig = {orig_points[3 * idx], orig_points[3 * idx + 1],
+                   orig_points[3 * idx + 2]};
+  float4 p_hom = transformPoint4x4(p_orig, projmatrix);
+  float p_w = 1.0f / (p_hom.w + 0.0000001f); // epsilon keeps the reciprocal finite when w is 0
+  float3 p_proj = {p_hom.x * p_w, p_hom.y * p_w, p_hom.z * p_w};
+  // If 3D covariance matrix is precomputed, use it, otherwise compute
+  // from scaling and rotation parameters.
+  const float *cov3D;
+  if (cov3D_precomp != nullptr) {
+    cov3D = cov3D_precomp + idx * 6; // six floats per Gaussian
+  } else {
+    computeCov3D(scales[idx], scale_modifier, rotations[idx], cov3Ds + idx * 6);
+    cov3D = cov3Ds + idx * 6;
+  }
+  // Compute 2D screen-space covariance matrix
+  float3 cov = computeCov2D(p_orig, focal_x, focal_y, tan_fovx, tan_fovy, cov3D,
+                            viewmatrix);
+  // Invert covariance (EWA algorithm)
+  float det = (cov.x * cov.z - cov.y * cov.y);
+  if (det == 0.0f) // singular matrix: leave radius / tiles_touched at 0
+    return;
+  float det_inv = 1.f / det;
+  float3 conic = {cov.z * det_inv, -cov.y * det_inv, cov.x * det_inv};
+  // Compute extent in screen space (by finding eigenvalues of
+  // 2D covariance matrix). Use extent to compute a bounding rectangle
+  // of screen-space tiles that this Gaussian overlaps with. Quit if
+  // rectangle covers 0 tiles.
+  float mid = 0.5f * (cov.x + cov.z);
+  float lambda1 = mid + sqrt(max(0.1f, mid * mid - det)); // the discriminant is floored at 0.1 before the square root
+  float lambda2 = mid - sqrt(max(0.1f, mid * mid - det));
+  float my_radius = ceil(3.f * sqrt(max(lambda1, lambda2))); // three standard deviations along the larger eigenvalue, rounded up
+  float2 point_image = {ndc2Pix(p_proj.x, W), ndc2Pix(p_proj.y, H)};
+  uint2 rect_min, rect_max;
+  getRect(point_image, my_radius, rect_min, rect_max, grid);
+  if ((rect_max.x - rect_min.x) * (rect_max.y - rect_min.y) == 0)
+    return;
+  // If colors have been precomputed, use them, otherwise convert
+  // spherical harmonics coefficients to RGB color.
+  if (colors_precomp == nullptr) {
+    glm::vec3 result = computeColorFromSH(idx, D, M, (glm::vec3 *)orig_points,
+                                          *cam_pos, shs, clamped);
+    rgb[idx * C + 0] = result.x; // NOTE(review): exactly three components are written regardless of C
+    rgb[idx * C + 1] = result.y;
+    rgb[idx * C + 2] = result.z;
+  }
+  // Store some useful helper data for the next steps.
+  depths[idx] = p_view.z;
+  radii[idx] = my_radius; // float to int conversion; my_radius is already a whole number after ceil
+  points_xy_image[idx] = point_image;
+  // Inverse 2D covariance and opacity neatly pack into one float4
+  conic_opacity[idx] = {conic.x, conic.y, conic.z, opacities[idx]};
+  tiles_touched[idx] = (rect_max.y - rect_min.y) * (rect_max.x - rect_min.x);
+}
+// Main rasterization method. Collaboratively works on one tile per
+// block, each thread treats one pixel. Alternates between fetching
+// and rasterizing data. Writes the blended color, the final transmittance and the last contributor index per pixel.
+template <uint32_t CHANNELS>
+__global__ void __launch_bounds__(BLOCK_X *BLOCK_Y)
+    renderCUDA(const uint2 *__restrict__ ranges,
+               const uint32_t *__restrict__ point_list, int W, int H,
+               const float2 *__restrict__ points_xy_image,
+               const float *__restrict__ features,
+               const float4 *__restrict__ conic_opacity,
+               float *__restrict__ final_T, uint32_t *__restrict__ n_contrib,
+               const float *__restrict__ bg_color,
+               float *__restrict__ out_color) {
+  // Identify current tile and associated min/max pixel range.
+  auto block = cg::this_thread_block();
+  uint32_t horizontal_blocks = (W + BLOCK_X - 1) / BLOCK_X;
+  uint2 pix_min = {block.group_index().x * BLOCK_X,
+                   block.group_index().y * BLOCK_Y};
+  uint2 pix_max = {min(pix_min.x + BLOCK_X, W), min(pix_min.y + BLOCK_Y, H)}; // NOTE(review): pix_max is not read anywhere else in this kernel
+  uint2 pix = {pix_min.x + block.thread_index().x,
+               pix_min.y + block.thread_index().y};
+  uint32_t pix_id = W * pix.y + pix.x; // row-major pixel index within one channel plane
+  float2 pixf = {(float)pix.x, (float)pix.y};
+  // Check if this thread is associated with a valid pixel or outside.
+  bool inside = pix.x < W && pix.y < H;
+  // Done threads can help with fetching, but don't rasterize
+  bool done = !inside;
+  // Load start/end range of IDs to process in bit sorted list.
+  uint2 range =
+      ranges[block.group_index().y * horizontal_blocks + block.group_index().x];
+  const int rounds = ((range.y - range.x + BLOCK_SIZE - 1) / BLOCK_SIZE); // number of BLOCK_SIZE-sized batches, rounded up
+  int toDo = range.y - range.x;
+  // Allocate storage for batches of collectively fetched data.
+  __shared__ int collected_id[BLOCK_SIZE];
+  __shared__ float2 collected_xy[BLOCK_SIZE];
+  __shared__ float4 collected_conic_opacity[BLOCK_SIZE];
+  // Initialize helper variables
+  float T = 1.0f; // running transmittance: product of (1 - alpha) of the Gaussians blended so far
+  uint32_t contributor = 0;
+  uint32_t last_contributor = 0;
+  float C[CHANNELS] = {0};
+  // Iterate over batches until all done or range is complete
+  for (int i = 0; i < rounds; i++, toDo -= BLOCK_SIZE) {
+    // End if entire block votes that it is done rasterizing
+    int num_done = __syncthreads_count(done); // also acts as the barrier before shared memory is overwritten
+    if (num_done == BLOCK_SIZE)
+      break;
+    // Collectively fetch per-Gaussian data from global to shared
+    int progress = i * BLOCK_SIZE + block.thread_rank();
+    if (range.x + progress < range.y) {
+      int coll_id = point_list[range.x + progress];
+      collected_id[block.thread_rank()] = coll_id;
+      collected_xy[block.thread_rank()] = points_xy_image[coll_id];
+      collected_conic_opacity[block.thread_rank()] = conic_opacity[coll_id];
+    }
+    block.sync();
+    // Iterate over current batch
+    for (int j = 0; !done && j < min(BLOCK_SIZE, toDo); j++) {
+      // Keep track of current position in range
+      contributor++; // incremented before use, so positions are 1-based
+      // Resample using conic matrix (cf. "Surface
+      // Splatting" by Zwicker et al., 2001)
+      float2 xy = collected_xy[j];
+      float2 d = {xy.x - pixf.x, xy.y - pixf.y};
+      float4 con_o = collected_conic_opacity[j];
+      float power = -0.5f * (con_o.x * d.x * d.x + con_o.z * d.y * d.y) -
+                    con_o.y * d.x * d.y;
+      if (power > 0.0f) // a positive exponent would give a weight above 1; skip it
+        continue;
+      // Eq. (2) from 3D Gaussian splatting paper.
+      // Obtain alpha by multiplying with Gaussian opacity
+      // and its exponential falloff from mean.
+      // Avoid numerical instabilities (see paper appendix).
+      float alpha = min(0.99f, con_o.w * exp(power));
+      if (alpha < 1.0f / 255.0f)
+        continue;
+      float test_T = T * (1 - alpha);
+      if (test_T < 0.0001f) { // pixel is effectively opaque: stop without blending this Gaussian
+        done = true;
+        continue;
+      }
+      // Eq. (3) from 3D Gaussian splatting paper.
+      for (int ch = 0; ch < CHANNELS; ch++)
+        C[ch] += features[collected_id[j] * CHANNELS + ch] * alpha * T;
+      T = test_T;
+      // Keep track of last range entry to update this
+      // pixel.
+      last_contributor = contributor;
+    }
+  }
+  // All threads that treat valid pixel write out their final
+  // rendering data to the frame and auxiliary buffers.
+  if (inside) {
+    final_T[pix_id] = T;
+    n_contrib[pix_id] = last_contributor;
+    for (int ch = 0; ch < CHANNELS; ch++)
+      out_color[ch * H * W + pix_id] = C[ch] + T * bg_color[ch]; // planar (channel-major) output; background weighted by the remaining transmittance
+  }
+}
+// Host-side launcher for the forward render kernel. Passes every argument
+// through to renderCUDA<NUM_CHANNELS> using the caller-supplied grid /
+// block launch configuration.
+void FORWARD::render(const dim3 grid, dim3 block, const uint2 *ranges,
+                     const uint32_t *point_list, int W, int H,
+                     const float2 *means2D, const float *colors,
+                     const float4 *conic_opacity, float *final_T,
+                     uint32_t *n_contrib, const float *bg_color,
+                     float *out_color) {
+  renderCUDA<NUM_CHANNELS><<<grid, block>>>(
+      // Per-tile ranges into the point list, and image size
+      ranges, point_list, W, H,
+      // Per-Gaussian screen positions, colors and conic + opacity
+      means2D, colors, conic_opacity,
+      // Per-pixel outputs, with the background color used for blending
+      final_T, n_contrib, bg_color, out_color);
+}
+// Host-side launcher for the forward preprocessing kernel: one thread per
+// Gaussian, in blocks of 256 threads.
+void FORWARD::preprocess(int P, int D, int M, const float *means3D,
+                         const glm::vec3 *scales, const float scale_modifier,
+                         const glm::vec4 *rotations, const float *opacities,
+                         const float *shs, bool *clamped,
+                         const float *cov3D_precomp,
+                         const float *colors_precomp, const float *viewmatrix,
+                         const float *projmatrix, const glm::vec3 *cam_pos,
+                         const int W, int H, const float focal_x, float focal_y,
+                         const float tan_fovx, float tan_fovy, int *radii,
+                         float2 *means2D, float *depths, float *cov3Ds,
+                         float *rgb, float4 *conic_opacity, const dim3 grid,
+                         uint32_t *tiles_touched, bool prefiltered) {
+  const int threads_per_block = 256;
+  const int num_blocks = (P + 255) / 256;
+  // The kernel expects the FOV tangents before the focal lengths, which is
+  // the reverse of this function's own parameter order.
+  preprocessCUDA<NUM_CHANNELS><<<num_blocks, threads_per_block>>>(
+      P, D, M, means3D, scales, scale_modifier, rotations, opacities, shs,
+      clamped, cov3D_precomp, colors_precomp, viewmatrix, projmatrix, cam_pos,
+      W, H, tan_fovx, tan_fovy, focal_x, focal_y, radii, means2D, depths,
+      cov3Ds, rgb, conic_opacity, grid, tiles_touched, prefiltered);
+}

gaussiancity/extensions/diff_gaussian_rasterization/cuda_rasterizer/forward.h ADDED Viewed

	@@ -0,0 +1,43 @@

+/*
+ * Copyright (C) 2023, Inria
+ * GRAPHDECO research group, https://team.inria.fr/graphdeco
+ * All rights reserved.
+ *
+ * This software is free for non-commercial, research and evaluation use
+ * under the terms of the LICENSE.md file.
+ *
+ * For inquiries contact  george.drettakis@inria.fr
+ */
+#ifndef CUDA_RASTERIZER_FORWARD_H_INCLUDED
+#define CUDA_RASTERIZER_FORWARD_H_INCLUDED
+#include "cuda_runtime.h"
+#include "device_launch_parameters.h"
+#include <cuda.h>
+#define GLM_FORCE_CUDA
+#include <glm/glm.hpp>
+namespace FORWARD { // Host-side launchers for the forward kernels
+// Perform initial steps for each Gaussian prior to rasterization. Launches the preprocessing kernel over P Gaussians; cov3D_precomp and colors_precomp may be null, in which case they are computed from scales/rotations and shs.
+void preprocess(int P, int D, int M, const float *orig_points,
+                const glm::vec3 *scales, const float scale_modifier,
+                const glm::vec4 *rotations, const float *opacities,
+                const float *shs, bool *clamped, const float *cov3D_precomp,
+                const float *colors_precomp, const float *viewmatrix,
+                const float *projmatrix, const glm::vec3 *cam_pos, const int W,
+                int H, const float focal_x, float focal_y, const float tan_fovx,
+                float tan_fovy, int *radii, float2 *points_xy_image, // radii onward: per-Gaussian buffers written by the kernel
+                float *depths, float *cov3Ds, float *colors,
+                float4 *conic_opacity, const dim3 grid, uint32_t *tiles_touched,
+                bool prefiltered);
+// Main rasterization method. Launches the render kernel with the given grid / block configuration; final_T, n_contrib and out_color are per-pixel outputs.
+void render(const dim3 grid, dim3 block, const uint2 *ranges,
+            const uint32_t *point_list, int W, int H,
+            const float2 *points_xy_image, const float *features,
+            const float4 *conic_opacity, float *final_T, uint32_t *n_contrib,
+            const float *bg_color, float *out_color);
+} // namespace FORWARD
+#endif

gaussiancity/extensions/diff_gaussian_rasterization/cuda_rasterizer/rasterizer.h ADDED Viewed

	@@ -0,0 +1,52 @@

+/*
+ * Copyright (C) 2023, Inria
+ * GRAPHDECO research group, https://team.inria.fr/graphdeco
+ * All rights reserved.
+ *
+ * This software is free for non-commercial, research and evaluation use
+ * under the terms of the LICENSE.md file.
+ *
+ * For inquiries contact  george.drettakis@inria.fr
+ */
+#ifndef CUDA_RASTERIZER_H_INCLUDED
+#define CUDA_RASTERIZER_H_INCLUDED
+#include <functional>
+#include <vector>
+namespace CudaRasterizer {
+class Rasterizer { // Static-only interface of the rasterizer; the definitions are not part of this header
+public:
+  static void markVisible(int P, float *means3D, float *viewmatrix, // presumably flags in present[] which of the P points are visible; confirm in the implementation
+                          float *projmatrix, bool *present);
+  static int forward(std::function<char *(size_t)> geometryBuffer, // the three callbacks map a byte count to a char*; presumably scratch-buffer allocators (TODO confirm)
+                     std::function<char *(size_t)> binningBuffer,
+                     std::function<char *(size_t)> imageBuffer, const int P,
+                     int D, int M, const float *background, const int width,
+                     int height, const float *means3D, const float *shs,
+                     const float *colors_precomp, const float *opacities,
+                     const float *scales, const float scale_modifier,
+                     const float *rotations, const float *cov3D_precomp,
+                     const float *viewmatrix, const float *projmatrix,
+                     const float *cam_pos, const float tan_fovx, float tan_fovy,
+                     const bool prefiltered, float *out_color,
+                     int *radii = nullptr, bool debug = false); // NOTE(review): the meaning of the int return value is not visible in this header
+  static void
+  backward(const int P, int D, int M, int R, const float *background, // NOTE(review): the role of R is not visible in this header
+           const int width, int height, const float *means3D, const float *shs,
+           const float *colors_precomp, const float *scales,
+           const float scale_modifier, const float *rotations,
+           const float *cov3D_precomp, const float *viewmatrix,
+           const float *projmatrix, const float *campos, const float tan_fovx,
+           float tan_fovy, const int *radii, char *geom_buffer, // raw buffers, presumably the ones filled during forward (TODO confirm)
+           char *binning_buffer, char *image_buffer, const float *dL_dpix, // dL_dpix is the only const dL_* pointer, i.e. the incoming gradient
+           float *dL_dmean2D, float *dL_dconic, float *dL_dopacity,
+           float *dL_dcolor, float *dL_dmean3D, float *dL_dcov3D, float *dL_dsh,
+           float *dL_dscale, float *dL_drot, bool debug);
+};
+}; // namespace CudaRasterizer
+#endif

gaussiancity/extensions/diff_gaussian_rasterization/cuda_rasterizer/rasterizer_impl.cu ADDED Viewed

	@@ -0,0 +1,339 @@

+/*
+ * Copyright (C) 2023, Inria
+ * GRAPHDECO research group, https://team.inria.fr/graphdeco
+ * All rights reserved.
+ *
+ * This software is free for non-commercial, research and evaluation use
+ * under the terms of the LICENSE.md file.
+ *
+ * For inquiries contact  george.drettakis@inria.fr
+ */
+#include "cuda_runtime.h"
+#include "device_launch_parameters.h"
+#include "rasterizer_impl.h"
+#include <algorithm>
+#include <cub/cub.cuh>
+#include <cub/device/device_radix_sort.cuh>
+#include <cuda.h>
+#include <fstream>
+#include <iostream>
+#include <numeric>
+#define GLM_FORCE_CUDA
+#include <glm/glm.hpp>
+#include <cooperative_groups.h>
+#include <cooperative_groups/reduce.h>
+namespace cg = cooperative_groups;
+#include "auxiliary.h"
+#include "backward.h"
+#include "forward.h"
+// CPU helper: binary-searches for the bit position just above the most
+// significant set bit of `n` (e.g. 1 -> 1, 4 -> 3, 255 -> 8, 256 -> 9).
+// For n == 0 the result is 1.
+uint32_t getHigherMsb(uint32_t n) {
+  // Start in the middle of the word and halve the stride every round.
+  uint32_t pos = sizeof(n) * 4;
+  for (uint32_t stride = pos / 2; stride >= 1; stride /= 2) {
+    // Move up while bits remain at or above `pos`, otherwise move down.
+    pos = (n >> pos) ? pos + stride : pos - stride;
+  }
+  // One final correction if a bit is still set at or above `pos`.
+  return (n >> pos) ? pos + 1 : pos;
+}
+// Wrapper method to call auxiliary coarse frustum containment test.
+// Mark all Gaussians that pass it.
+//
+// One thread handles one point: thread `idx` stores the result of
+// in_frustum() for point `idx` in present[idx]. Threads whose rank is >= P
+// return without writing anything.
+__global__ void checkFrustum(int P, const float *orig_points,
+                             const float *viewmatrix, const float *projmatrix,
+                             bool *present) {
+  auto idx = cg::this_grid().thread_rank();
+  if (idx >= P)
+    return;
+  // p_view is filled in by in_frustum() but not used by this kernel.
+  // NOTE(review): the literal `false` is presumably the `prefiltered` flag --
+  // confirm against the in_frustum() definition.
+  float3 p_view;
+  present[idx] =
+      in_frustum(idx, orig_points, viewmatrix, projmatrix, false, p_view);
+}
+// Generates one key/value pair for all Gaussian / tile overlaps.
+// Run once per Gaussian (1:N mapping).
+//
+// `offsets` is read as an inclusive prefix sum: Gaussian `idx` writes its
+// pairs starting at offsets[idx - 1] (0 for the first Gaussian). Threads
+// whose rank is >= P return immediately.
+__global__ void duplicateWithKeys(int P, const float2 *points_xy,
+                                  const float *depths, const uint32_t *offsets,
+                                  uint64_t *gaussian_keys_unsorted,
+                                  uint32_t *gaussian_values_unsorted,
+                                  int *radii, dim3 grid) {
+  auto idx = cg::this_grid().thread_rank();
+  if (idx >= P)
+    return;
+  // Generate no key/value pair for invisible Gaussians
+  if (radii[idx] > 0) {
+    // Find this Gaussian's offset in buffer for writing keys/values.
+    uint32_t off = (idx == 0) ? 0 : offsets[idx - 1];
+    uint2 rect_min, rect_max;
+    getRect(points_xy[idx], radii[idx], rect_min, rect_max, grid);
+    // For each tile that the bounding rect overlaps, emit a
+    // key/value pair. The key is |  tile ID  |      depth      |,
+    // and the value is the ID of the Gaussian. Sorting the values
+    // with this key yields Gaussian IDs in a list, such that they
+    // are first sorted by tile and then by depth.
+    for (int y = rect_min.y; y < rect_max.y; y++) {
+      for (int x = rect_min.x; x < rect_max.x; x++) {
+        // Upper 32 bits: linear tile index (row-major over the tile grid).
+        uint64_t key = y * grid.x + x;
+        key <<= 32;
+        // Lower 32 bits: the raw bit pattern of the float depth.
+        key |= *((uint32_t *)&depths[idx]);
+        gaussian_keys_unsorted[off] = key;
+        gaussian_values_unsorted[off] = idx;
+        off++;
+      }
+    }
+  }
+}
+// Check keys to see if it is at the start/end of one tile's range in
+// the full sorted list. If yes, write start/end of this tile.
+// Run once per instanced (duplicated) Gaussian ID.
+//
+// L is the number of entries in `point_list_keys`. After the kernel,
+// ranges[t].x is the index of the first entry of tile t and ranges[t].y is
+// one past its last entry. Tiles that own no entry are never written here,
+// so they keep whatever value `ranges` held before the launch.
+__global__ void identifyTileRanges(int L, uint64_t *point_list_keys,
+                                   uint2 *ranges) {
+  auto idx = cg::this_grid().thread_rank();
+  if (idx >= L)
+    return;
+  // Read tile ID from key. Update start/end of tile range if at limit.
+  uint64_t key = point_list_keys[idx];
+  uint32_t currtile = key >> 32;
+  if (idx == 0)
+    ranges[currtile].x = 0;
+  else {
+    // A change of tile ID between neighbouring entries marks a boundary:
+    // close the previous tile's range and open the current one.
+    uint32_t prevtile = point_list_keys[idx - 1] >> 32;
+    if (currtile != prevtile) {
+      ranges[prevtile].y = idx;
+      ranges[currtile].x = idx;
+    }
+  }
+  // The last entry closes the range of its tile.
+  if (idx == L - 1)
+    ranges[currtile].y = L;
+}
+// Launches the frustum-test kernel over all P Gaussians; `present` receives
+// one visible/invisible flag per Gaussian.
+void CudaRasterizer::Rasterizer::markVisible(int P, float *means3D,
+                                             float *viewmatrix,
+                                             float *projmatrix, bool *present) {
+  // One thread per Gaussian, rounded up to whole blocks of 256 threads.
+  const int threads_per_block = 256;
+  const int num_blocks = (P + threads_per_block - 1) / threads_per_block;
+  checkFrustum<<<num_blocks, threads_per_block>>>(P, means3D, viewmatrix,
+                                                  projmatrix, present);
+}
+// Carves the per-Gaussian working arrays for P Gaussians out of `chunk`,
+// advancing `chunk` past each array; every array is aligned to 128 bytes.
+// The order of the obtain() calls defines the memory layout, so it has to be
+// the same whenever the same buffer is interpreted again (forward() and
+// backward() both go through this function).
+CudaRasterizer::GeometryState
+CudaRasterizer::GeometryState::fromChunk(char *&chunk, size_t P) {
+  GeometryState geom;
+  obtain(chunk, geom.depths, P, 128);
+  obtain(chunk, geom.clamped, P * 3, 128);
+  obtain(chunk, geom.internal_radii, P, 128);
+  obtain(chunk, geom.means2D, P, 128);
+  obtain(chunk, geom.cov3D, P * 6, 128);
+  obtain(chunk, geom.conic_opacity, P, 128);
+  obtain(chunk, geom.rgb, P * 3, 128);
+  obtain(chunk, geom.tiles_touched, P, 128);
+  // Size query only: with a null temp-storage pointer CUB just reports the
+  // number of temp bytes the scan needs in geom.scan_size.
+  cub::DeviceScan::InclusiveSum(nullptr, geom.scan_size, geom.tiles_touched,
+                                geom.tiles_touched, P);
+  obtain(chunk, geom.scanning_space, geom.scan_size, 128);
+  obtain(chunk, geom.point_offsets, P, 128);
+  return geom;
+}
+// Carves the image-sized working arrays (N entries each, 128-byte aligned)
+// out of `chunk`, advancing `chunk` past each array. The order of the
+// obtain() calls defines the memory layout of the buffer.
+CudaRasterizer::ImageState CudaRasterizer::ImageState::fromChunk(char *&chunk,
+                                                                 size_t N) {
+  ImageState img;
+  obtain(chunk, img.accum_alpha, N, 128);
+  obtain(chunk, img.n_contrib, N, 128);
+  obtain(chunk, img.ranges, N, 128);
+  return img;
+}
+// Carves the key/value lists used for sorting (P entries each, 128-byte
+// aligned) plus the sort's temp storage out of `chunk`, advancing `chunk`
+// past each array. Here P is the number of list entries, not the number of
+// Gaussians: forward() passes num_rendered and backward() passes R.
+CudaRasterizer::BinningState
+CudaRasterizer::BinningState::fromChunk(char *&chunk, size_t P) {
+  BinningState binning;
+  obtain(chunk, binning.point_list, P, 128);
+  obtain(chunk, binning.point_list_unsorted, P, 128);
+  obtain(chunk, binning.point_list_keys, P, 128);
+  obtain(chunk, binning.point_list_keys_unsorted, P, 128);
+  // Size query only: with a null temp-storage pointer CUB just reports the
+  // number of temp bytes the sort needs in binning.sorting_size.
+  cub::DeviceRadixSort::SortPairs(
+      nullptr, binning.sorting_size, binning.point_list_keys_unsorted,
+      binning.point_list_keys, binning.point_list_unsorted, binning.point_list,
+      P);
+  obtain(chunk, binning.list_sorting_space, binning.sorting_size, 128);
+  return binning;
+}
+// Forward rendering procedure for differentiable rasterization
+// of Gaussians.
+//
+// Steps, in order: per-Gaussian preprocessing, prefix sum over the per-
+// Gaussian tile counts, key/value duplication, radix sort by [tile | depth],
+// per-tile range detection, and per-tile blending into `out_color`.
+// The three buffer callbacks are each called once with the number of bytes
+// needed and must return storage of at least that size.
+// Returns num_rendered, the total number of (Gaussian, tile) instances.
+//
+// NOTE(review): CHECK_CUDA is a macro defined outside this file. The calls
+// below are written without a trailing semicolon and `CHECK_CUDA(, debug)` is
+// used directly after kernel launches, so it presumably expands to its first
+// argument followed by a debug-only error check -- confirm against its
+// definition before reformatting any of these statements.
+int CudaRasterizer::Rasterizer::forward(
+    std::function<char *(size_t)> geometryBuffer,
+    std::function<char *(size_t)> binningBuffer,
+    std::function<char *(size_t)> imageBuffer, const int P, int D, int M,
+    const float *background, const int width, int height, const float *means3D,
+    const float *shs, const float *colors_precomp, const float *opacities,
+    const float *scales, const float scale_modifier, const float *rotations,
+    const float *cov3D_precomp, const float *viewmatrix,
+    const float *projmatrix, const float *cam_pos, const float tan_fovx,
+    float tan_fovy, const bool prefiltered, float *out_color, int *radii,
+    bool debug) {
+  const float focal_y = height / (2.0f * tan_fovy);
+  const float focal_x = width / (2.0f * tan_fovx);
+  // Ask the caller for a geometry buffer large enough for P Gaussians and lay
+  // the per-Gaussian arrays out inside it.
+  size_t chunk_size = required<GeometryState>(P);
+  char *chunkptr = geometryBuffer(chunk_size);
+  GeometryState geomState = GeometryState::fromChunk(chunkptr, P);
+  // Fall back to the internal radii array when the caller supplies none.
+  if (radii == nullptr) {
+    radii = geomState.internal_radii;
+  }
+  // Number of BLOCK_X x BLOCK_Y tiles needed to cover the image (rounded up).
+  dim3 tile_grid((width + BLOCK_X - 1) / BLOCK_X,
+                 (height + BLOCK_Y - 1) / BLOCK_Y, 1);
+  dim3 block(BLOCK_X, BLOCK_Y, 1);
+  // Dynamically resize image-based auxiliary buffers during training
+  size_t img_chunk_size = required<ImageState>(width * height);
+  char *img_chunkptr = imageBuffer(img_chunk_size);
+  ImageState imgState = ImageState::fromChunk(img_chunkptr, width * height);
+  if (NUM_CHANNELS != 3 && colors_precomp == nullptr) {
+    throw std::runtime_error(
+        "For non-RGB, provide precomputed Gaussian colors!");
+  }
+  // Run preprocessing per-Gaussian (transformation, bounding, conversion of SHs
+  // to RGB)
+  CHECK_CUDA(FORWARD::preprocess(
+                 P, D, M, means3D, (glm::vec3 *)scales, scale_modifier,
+                 (glm::vec4 *)rotations, opacities, shs, geomState.clamped,
+                 cov3D_precomp, colors_precomp, viewmatrix, projmatrix,
+                 (glm::vec3 *)cam_pos, width, height, focal_x, focal_y,
+                 tan_fovx, tan_fovy, radii, geomState.means2D, geomState.depths,
+                 geomState.cov3D, geomState.rgb, geomState.conic_opacity,
+                 tile_grid, geomState.tiles_touched, prefiltered),
+             debug)
+  // Compute prefix sum over full list of touched tile counts by Gaussians
+  // E.g., [2, 3, 0, 2, 1] -> [2, 5, 5, 7, 8]
+  CHECK_CUDA(cub::DeviceScan::InclusiveSum(
+                 geomState.scanning_space, geomState.scan_size,
+                 geomState.tiles_touched, geomState.point_offsets, P),
+             debug)
+  // Retrieve total number of Gaussian instances to launch and resize aux
+  // buffers
+  // (the last element of the inclusive prefix sum is the total count).
+  int num_rendered;
+  CHECK_CUDA(cudaMemcpy(&num_rendered, geomState.point_offsets + P - 1,
+                        sizeof(int), cudaMemcpyDeviceToHost),
+             debug);
+  size_t binning_chunk_size = required<BinningState>(num_rendered);
+  char *binning_chunkptr = binningBuffer(binning_chunk_size);
+  BinningState binningState =
+      BinningState::fromChunk(binning_chunkptr, num_rendered);
+  // For each instance to be rendered, produce adequate [ tile | depth ] key
+  // and corresponding duplicated Gaussian indices to be sorted
+  // (the `int bit = ...` declaration below is a separate statement that
+  // follows the CHECK_CUDA expansion; only its line layout is unusual).
+  duplicateWithKeys<<<(P + 255) / 256, 256>>>(
+      P, geomState.means2D, geomState.depths, geomState.point_offsets,
+      binningState.point_list_keys_unsorted, binningState.point_list_unsorted,
+      radii, tile_grid) CHECK_CUDA(, debug)
+      int bit = getHigherMsb(tile_grid.x * tile_grid.y);
+  // Sort complete list of (duplicated) Gaussian indices by keys
+  // Only key bits [0, 32 + bit) take part in the sort: the 32 depth bits plus
+  // as many tile-ID bits as the tile count requires.
+  CHECK_CUDA(cub::DeviceRadixSort::SortPairs(
+                 binningState.list_sorting_space, binningState.sorting_size,
+                 binningState.point_list_keys_unsorted,
+                 binningState.point_list_keys, binningState.point_list_unsorted,
+                 binningState.point_list, num_rendered, 0, 32 + bit),
+             debug)
+  // Zero the per-tile ranges so that tiles without any entry end up with an
+  // empty (0, 0) range.
+  CHECK_CUDA(
+      cudaMemset(imgState.ranges, 0, tile_grid.x * tile_grid.y * sizeof(uint2)),
+      debug);
+  // Identify start and end of per-tile workloads in sorted list
+  if (num_rendered > 0)
+    identifyTileRanges<<<(num_rendered + 255) / 256, 256>>>(
+        num_rendered, binningState.point_list_keys, imgState.ranges);
+  CHECK_CUDA(, debug)
+  // Let each tile blend its range of Gaussians independently in parallel
+  // Precomputed colors take precedence over the colors computed during
+  // preprocessing.
+  const float *feature_ptr =
+      colors_precomp != nullptr ? colors_precomp : geomState.rgb;
+  CHECK_CUDA(FORWARD::render(tile_grid, block, imgState.ranges,
+                             binningState.point_list, width, height,
+                             geomState.means2D, feature_ptr,
+                             geomState.conic_opacity, imgState.accum_alpha,
+                             imgState.n_contrib, background, out_color),
+             debug)
+  return num_rendered;
+}
+// Produce necessary gradients for optimization, corresponding
+// to forward render pass
+//
+// The three raw buffers are re-interpreted with the same fromChunk() layout
+// that forward() used: `geom_buffer` for P Gaussians, `binning_buffer` for R
+// list entries and `img_buffer` for width * height pixels. `dL_dpix` is the
+// incoming per-pixel gradient; the other dL_d* pointers receive the results.
+// NOTE(review): CHECK_CUDA is a macro defined outside this file; the calls
+// below rely on its expansion for statement termination -- confirm against
+// its definition before reformatting.
+void CudaRasterizer::Rasterizer::backward(
+    const int P, int D, int M, int R, const float *background, const int width,
+    int height, const float *means3D, const float *shs,
+    const float *colors_precomp, const float *scales,
+    const float scale_modifier, const float *rotations,
+    const float *cov3D_precomp, const float *viewmatrix,
+    const float *projmatrix, const float *campos, const float tan_fovx,
+    float tan_fovy, const int *radii, char *geom_buffer, char *binning_buffer,
+    char *img_buffer, const float *dL_dpix, float *dL_dmean2D, float *dL_dconic,
+    float *dL_dopacity, float *dL_dcolor, float *dL_dmean3D, float *dL_dcov3D,
+    float *dL_dsh, float *dL_dscale, float *dL_drot, bool debug) {
+  GeometryState geomState = GeometryState::fromChunk(geom_buffer, P);
+  BinningState binningState = BinningState::fromChunk(binning_buffer, R);
+  ImageState imgState = ImageState::fromChunk(img_buffer, width * height);
+  // Fall back to the radii stored in the geometry buffer when none are given.
+  if (radii == nullptr) {
+    radii = geomState.internal_radii;
+  }
+  const float focal_y = height / (2.0f * tan_fovy);
+  const float focal_x = width / (2.0f * tan_fovx);
+  // Same tile grid / block shape computation as in forward().
+  const dim3 tile_grid((width + BLOCK_X - 1) / BLOCK_X,
+                       (height + BLOCK_Y - 1) / BLOCK_Y, 1);
+  const dim3 block(BLOCK_X, BLOCK_Y, 1);
+  // Compute loss gradients w.r.t. 2D mean position, conic matrix,
+  // opacity and RGB of Gaussians from per-pixel loss gradients.
+  // If we were given precomputed colors and not SHs, use them.
+  const float *color_ptr =
+      (colors_precomp != nullptr) ? colors_precomp : geomState.rgb;
+  CHECK_CUDA(BACKWARD::render(
+                 tile_grid, block, imgState.ranges, binningState.point_list,
+                 width, height, background, geomState.means2D,
+                 geomState.conic_opacity, color_ptr, imgState.accum_alpha,
+                 imgState.n_contrib, dL_dpix, (float3 *)dL_dmean2D,
+                 (float4 *)dL_dconic, dL_dopacity, dL_dcolor),
+             debug)
+  // Take care of the rest of preprocessing. Was the precomputed covariance
+  // given to us or a scales/rot pair? If precomputed, pass that. If not,
+  // use the one we computed ourselves.
+  const float *cov3D_ptr =
+      (cov3D_precomp != nullptr) ? cov3D_precomp : geomState.cov3D;
+  CHECK_CUDA(BACKWARD::preprocess(
+                 P, D, M, (float3 *)means3D, radii, shs, geomState.clamped,
+                 (glm::vec3 *)scales, (glm::vec4 *)rotations, scale_modifier,
+                 cov3D_ptr, viewmatrix, projmatrix, focal_x, focal_y, tan_fovx,
+                 tan_fovy, (glm::vec3 *)campos, (float3 *)dL_dmean2D, dL_dconic,
+                 (glm::vec3 *)dL_dmean3D, dL_dcolor, dL_dcov3D, dL_dsh,
+                 (glm::vec3 *)dL_dscale, (glm::vec4 *)dL_drot),
+             debug)
+}

gaussiancity/extensions/diff_gaussian_rasterization/cuda_rasterizer/rasterizer_impl.h ADDED Viewed

	@@ -0,0 +1,70 @@

+/*
+ * Copyright (C) 2023, Inria
+ * GRAPHDECO research group, https://team.inria.fr/graphdeco
+ * All rights reserved.
+ *
+ * This software is free for non-commercial, research and evaluation use
+ * under the terms of the LICENSE.md file.
+ *
+ * For inquiries contact  george.drettakis@inria.fr
+ */
+#pragma once
+#include "rasterizer.h"
+#include <cuda_runtime_api.h>
+#include <iostream>
+#include <vector>
+namespace CudaRasterizer {
+// Sub-allocates `count` elements of T from `chunk`: rounds the current chunk
+// address up to a multiple of `alignment`, stores that address in `ptr`, and
+// advances `chunk` to just past the new array. The bit mask only rounds
+// correctly when `alignment` is a power of two. No memory is touched, so the
+// function also works on a null base pointer (see required()).
+template <typename T>
+static void obtain(char *&chunk, T *&ptr, std::size_t count,
+                   std::size_t alignment) {
+  std::size_t offset =
+      (reinterpret_cast<std::uintptr_t>(chunk) + alignment - 1) &
+      ~(alignment - 1);
+  ptr = reinterpret_cast<T *>(offset);
+  chunk = reinterpret_cast<char *>(ptr + count);
+}
+// Per-Gaussian working arrays, laid out by GeometryState::fromChunk().
+struct GeometryState {
+  // Bytes of CUB temp storage needed for the prefix sum.
+  size_t scan_size;
+  float *depths;
+  // CUB temp storage for the prefix sum (scan_size bytes).
+  char *scanning_space;
+  // P * 3 flags.
+  bool *clamped;
+  // Used as the radii array when the caller passes none.
+  int *internal_radii;
+  float2 *means2D;
+  // P * 6 floats.
+  float *cov3D;
+  float4 *conic_opacity;
+  // P * 3 floats.
+  float *rgb;
+  // Inclusive prefix sum of tiles_touched.
+  uint32_t *point_offsets;
+  uint32_t *tiles_touched;
+  static GeometryState fromChunk(char *&chunk, size_t P);
+};
+// Image-sized working arrays, laid out by ImageState::fromChunk().
+struct ImageState {
+  // Per-tile (start, end) index pair into the sorted point list.
+  uint2 *ranges;
+  uint32_t *n_contrib;
+  // Passed to FORWARD::render in its `final_T` parameter position.
+  float *accum_alpha;
+  static ImageState fromChunk(char *&chunk, size_t N);
+};
+// Key/value lists for tile/depth sorting, laid out by
+// BinningState::fromChunk().
+struct BinningState {
+  // Bytes of CUB temp storage needed for the radix sort.
+  size_t sorting_size;
+  uint64_t *point_list_keys_unsorted;
+  uint64_t *point_list_keys;
+  uint32_t *point_list_unsorted;
+  uint32_t *point_list;
+  // CUB temp storage for the radix sort (sorting_size bytes).
+  char *list_sorting_space;
+  static BinningState fromChunk(char *&chunk, size_t P);
+};
+// Returns the number of bytes a buffer must have so that T::fromChunk(.., P)
+// fits. The layout is replayed on a null base pointer, so the final pointer
+// value equals the bytes consumed; 128 extra bytes are added on top
+// (presumably slack for aligning a real, non-128-aligned base address --
+// confirm).
+template <typename T> size_t required(size_t P) {
+  char *size = nullptr;
+  T::fromChunk(size, P);
+  return ((size_t)size) + 128;
+}
+}; // namespace CudaRasterizer

gaussiancity/extensions/diff_gaussian_rasterization/rasterize_points.cu ADDED Viewed

	@@ -0,0 +1,173 @@

+/*
+ * Copyright (C) 2023, Inria
+ * GRAPHDECO research group, https://team.inria.fr/graphdeco
+ * All rights reserved.
+ *
+ * This software is free for non-commercial, research and evaluation use
+ * under the terms of the LICENSE.md file.
+ *
+ * For inquiries contact  george.drettakis@inria.fr
+ */
+#include "cuda_rasterizer/config.h"
+#include "cuda_rasterizer/rasterizer.h"
+#include <cstdio>
+#include <cuda_runtime_api.h>
+#include <fstream>
+#include <functional>
+#include <iostream>
+#include <math.h>
+#include <memory>
+#include <sstream>
+#include <stdio.h>
+#include <string>
+#include <torch/extension.h>
+#include <tuple>
+// Builds a callback that resizes tensor `t` to N elements and returns a raw
+// pointer to its storage. The callback captures `t` by reference, so the
+// tensor must outlive the returned function object.
+std::function<char *(size_t N)> resizeFunctional(torch::Tensor &t) {
+  return [&t](size_t N) {
+    t.resize_({static_cast<long long>(N)});
+    char *base = reinterpret_cast<char *>(t.contiguous().data_ptr());
+    return base;
+  };
+}
+// Torch-facing forward pass.
+//
+// Validates that `means3D` has shape (num_points, 3), allocates the output
+// image (NUM_CHANNELS x image_height x image_width, zero-filled), the
+// per-point radii and three resizable byte buffers on the CUDA device, then
+// calls CudaRasterizer::Rasterizer::forward() unless there are no points.
+// Raises a torch error (via AT_ERROR) when `means3D` has the wrong shape.
+//
+// Returns (rendered, out_color, radii, geomBuffer, binningBuffer, imgBuffer),
+// where `rendered` is the value returned by the rasterizer (0 when P == 0).
+std::tuple<int, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor,
+           torch::Tensor>
+RasterizeGaussiansCUDA(
+    const torch::Tensor &background, const torch::Tensor &means3D,
+    const torch::Tensor &colors, const torch::Tensor &opacity,
+    const torch::Tensor &scales, const torch::Tensor &rotations,
+    const float scale_modifier, const torch::Tensor &cov3D_precomp,
+    const torch::Tensor &viewmatrix, const torch::Tensor &projmatrix,
+    const float tan_fovx, const float tan_fovy, const int image_height,
+    const int image_width, const torch::Tensor &sh, const int degree,
+    const torch::Tensor &campos, const bool prefiltered, const bool debug) {
+  if (means3D.ndimension() != 2 || means3D.size(1) != 3) {
+    AT_ERROR("means3D must have dimensions (num_points, 3)");
+  }
+  const int P = means3D.size(0);
+  const int H = image_height;
+  const int W = image_width;
+  auto int_opts = means3D.options().dtype(torch::kInt32);
+  auto float_opts = means3D.options().dtype(torch::kFloat32);
+  torch::Tensor out_color = torch::full({NUM_CHANNELS, H, W}, 0.0, float_opts);
+  // Reuse int_opts (previously declared but unused) for the radii tensor.
+  torch::Tensor radii = torch::full({P}, 0, int_opts);
+  // The working buffers start empty; the rasterizer grows them on demand
+  // through the resize callbacks below.
+  torch::Device device(torch::kCUDA);
+  torch::TensorOptions options(torch::kByte);
+  torch::Tensor geomBuffer = torch::empty({0}, options.device(device));
+  torch::Tensor binningBuffer = torch::empty({0}, options.device(device));
+  torch::Tensor imgBuffer = torch::empty({0}, options.device(device));
+  std::function<char *(size_t)> geomFunc = resizeFunctional(geomBuffer);
+  std::function<char *(size_t)> binningFunc = resizeFunctional(binningBuffer);
+  std::function<char *(size_t)> imgFunc = resizeFunctional(imgBuffer);
+  int rendered = 0;
+  if (P != 0) {
+    // M is the second dimension of `sh`, or 0 when `sh` is empty.
+    int M = 0;
+    if (sh.size(0) != 0) {
+      M = sh.size(1);
+    }
+    // data_ptr<T>() is used throughout; Tensor::data<T>() is deprecated.
+    // The contiguous() temporaries live until the end of this full
+    // expression, i.e. for the whole duration of the call.
+    rendered = CudaRasterizer::Rasterizer::forward(
+        geomFunc, binningFunc, imgFunc, P, degree, M,
+        background.contiguous().data_ptr<float>(), W, H,
+        means3D.contiguous().data_ptr<float>(),
+        sh.contiguous().data_ptr<float>(),
+        colors.contiguous().data_ptr<float>(),
+        opacity.contiguous().data_ptr<float>(),
+        scales.contiguous().data_ptr<float>(), scale_modifier,
+        rotations.contiguous().data_ptr<float>(),
+        cov3D_precomp.contiguous().data_ptr<float>(),
+        viewmatrix.contiguous().data_ptr<float>(),
+        projmatrix.contiguous().data_ptr<float>(),
+        campos.contiguous().data_ptr<float>(), tan_fovx, tan_fovy, prefiltered,
+        out_color.contiguous().data_ptr<float>(),
+        radii.contiguous().data_ptr<int>(), debug);
+  }
+  return std::make_tuple(rendered, out_color, radii, geomBuffer, binningBuffer,
+                         imgBuffer);
+}
+// Torch-facing backward pass.
+//
+// Takes P from `means3D` and the image size (H, W) from dimensions 1 and 2 of
+// `dL_dout_color`, allocates zero-filled gradient tensors and calls
+// CudaRasterizer::Rasterizer::backward() unless there are no points.
+// `geomBuffer`, `binningBuffer` and `imageBuffer` are handed to the
+// rasterizer as raw byte buffers; `R` is forwarded unchanged.
+//
+// Returns (dL_dmeans2D, dL_dcolors, dL_dopacity, dL_dmeans3D, dL_dcov3D,
+// dL_dsh, dL_dscales, dL_drotations). dL_dconic is scratch space for the
+// rasterizer and is not returned.
+std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor,
+           torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor>
+RasterizeGaussiansBackwardCUDA(
+    const torch::Tensor &background, const torch::Tensor &means3D,
+    const torch::Tensor &radii, const torch::Tensor &colors,
+    const torch::Tensor &scales, const torch::Tensor &rotations,
+    const float scale_modifier, const torch::Tensor &cov3D_precomp,
+    const torch::Tensor &viewmatrix, const torch::Tensor &projmatrix,
+    const float tan_fovx, const float tan_fovy,
+    const torch::Tensor &dL_dout_color, const torch::Tensor &sh,
+    const int degree, const torch::Tensor &campos,
+    const torch::Tensor &geomBuffer, const int R,
+    const torch::Tensor &binningBuffer, const torch::Tensor &imageBuffer,
+    const bool debug) {
+  const int P = means3D.size(0);
+  const int H = dL_dout_color.size(1);
+  const int W = dL_dout_color.size(2);
+  // M is the second dimension of `sh`, or 0 when `sh` is empty.
+  int M = 0;
+  if (sh.size(0) != 0) {
+    M = sh.size(1);
+  }
+  torch::Tensor dL_dmeans3D = torch::zeros({P, 3}, means3D.options());
+  torch::Tensor dL_dmeans2D = torch::zeros({P, 3}, means3D.options());
+  torch::Tensor dL_dcolors = torch::zeros({P, NUM_CHANNELS}, means3D.options());
+  torch::Tensor dL_dconic = torch::zeros({P, 2, 2}, means3D.options());
+  torch::Tensor dL_dopacity = torch::zeros({P, 1}, means3D.options());
+  torch::Tensor dL_dcov3D = torch::zeros({P, 6}, means3D.options());
+  torch::Tensor dL_dsh = torch::zeros({P, M, 3}, means3D.options());
+  torch::Tensor dL_dscales = torch::zeros({P, 3}, means3D.options());
+  torch::Tensor dL_drotations = torch::zeros({P, 4}, means3D.options());
+  if (P != 0) {
+    // Every input is made contiguous before its raw pointer is taken.
+    // `scales` and `rotations` were previously passed without contiguous(),
+    // unlike the forward pass, so a strided tensor would have been read with
+    // the wrong layout. data_ptr<T>() replaces the deprecated data<T>().
+    // The gradient tensors above are freshly allocated, so contiguous() on
+    // them returns the same storage that is handed back to the caller.
+    CudaRasterizer::Rasterizer::backward(
+        P, degree, M, R, background.contiguous().data_ptr<float>(), W, H,
+        means3D.contiguous().data_ptr<float>(),
+        sh.contiguous().data_ptr<float>(),
+        colors.contiguous().data_ptr<float>(),
+        scales.contiguous().data_ptr<float>(), scale_modifier,
+        rotations.contiguous().data_ptr<float>(),
+        cov3D_precomp.contiguous().data_ptr<float>(),
+        viewmatrix.contiguous().data_ptr<float>(),
+        projmatrix.contiguous().data_ptr<float>(),
+        campos.contiguous().data_ptr<float>(), tan_fovx, tan_fovy,
+        radii.contiguous().data_ptr<int>(),
+        reinterpret_cast<char *>(geomBuffer.contiguous().data_ptr()),
+        reinterpret_cast<char *>(binningBuffer.contiguous().data_ptr()),
+        reinterpret_cast<char *>(imageBuffer.contiguous().data_ptr()),
+        dL_dout_color.contiguous().data_ptr<float>(),
+        dL_dmeans2D.contiguous().data_ptr<float>(),
+        dL_dconic.contiguous().data_ptr<float>(),
+        dL_dopacity.contiguous().data_ptr<float>(),
+        dL_dcolors.contiguous().data_ptr<float>(),
+        dL_dmeans3D.contiguous().data_ptr<float>(),
+        dL_dcov3D.contiguous().data_ptr<float>(),
+        dL_dsh.contiguous().data_ptr<float>(),
+        dL_dscales.contiguous().data_ptr<float>(),
+        dL_drotations.contiguous().data_ptr<float>(), debug);
+  }
+  return std::make_tuple(dL_dmeans2D, dL_dcolors, dL_dopacity, dL_dmeans3D,
+                         dL_dcov3D, dL_dsh, dL_dscales, dL_drotations);
+}
+// Returns a bool tensor with one entry per row of `means3D`, initialised to
+// false and then filled in by the rasterizer's frustum test. With zero points
+// the all-false (empty) tensor is returned without calling the rasterizer.
+torch::Tensor markVisible(torch::Tensor &means3D, torch::Tensor &viewmatrix,
+                          torch::Tensor &projmatrix) {
+  const int P = means3D.size(0);
+  const auto bool_opts = means3D.options().dtype(at::kBool);
+  torch::Tensor present = torch::full({P}, false, bool_opts);
+  if (P == 0) {
+    return present;
+  }
+  CudaRasterizer::Rasterizer::markVisible(
+      P, means3D.contiguous().data<float>(),
+      viewmatrix.contiguous().data<float>(),
+      projmatrix.contiguous().data<float>(),
+      present.contiguous().data<bool>());
+  return present;
+}

gaussiancity/extensions/diff_gaussian_rasterization/rasterize_points.h ADDED Viewed

	@@ -0,0 +1,46 @@

+/*
+ * Copyright (C) 2023, Inria
+ * GRAPHDECO research group, https://team.inria.fr/graphdeco
+ * All rights reserved.
+ *
+ * This software is free for non-commercial, research and evaluation use
+ * under the terms of the LICENSE.md file.
+ *
+ * For inquiries contact  george.drettakis@inria.fr
+ */
+#pragma once
+#include <cstdio>
+#include <string>
+#include <torch/extension.h>
+#include <tuple>
+// Forward pass. Returns (rendered, out_color, radii, geomBuffer,
+// binningBuffer, imgBuffer); see rasterize_points.cu for the definition.
+std::tuple<int, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor,
+           torch::Tensor>
+RasterizeGaussiansCUDA(
+    const torch::Tensor &background, const torch::Tensor &means3D,
+    const torch::Tensor &colors, const torch::Tensor &opacity,
+    const torch::Tensor &scales, const torch::Tensor &rotations,
+    const float scale_modifier, const torch::Tensor &cov3D_precomp,
+    const torch::Tensor &viewmatrix, const torch::Tensor &projmatrix,
+    const float tan_fovx, const float tan_fovy, const int image_height,
+    const int image_width, const torch::Tensor &sh, const int degree,
+    const torch::Tensor &campos, const bool prefiltered, const bool debug);
+// Backward pass. Returns (dL_dmeans2D, dL_dcolors, dL_dopacity, dL_dmeans3D,
+// dL_dcov3D, dL_dsh, dL_dscales, dL_drotations). `geomBuffer`,
+// `binningBuffer` and `imageBuffer` are passed to the rasterizer as raw byte
+// buffers and `R` is forwarded to it unchanged.
+std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor,
+           torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor>
+RasterizeGaussiansBackwardCUDA(
+    const torch::Tensor &background, const torch::Tensor &means3D,
+    const torch::Tensor &radii, const torch::Tensor &colors,
+    const torch::Tensor &scales, const torch::Tensor &rotations,
+    const float scale_modifier, const torch::Tensor &cov3D_precomp,
+    const torch::Tensor &viewmatrix, const torch::Tensor &projmatrix,
+    const float tan_fovx, const float tan_fovy,
+    const torch::Tensor &dL_dout_color, const torch::Tensor &sh,
+    const int degree, const torch::Tensor &campos,
+    const torch::Tensor &geomBuffer, const int R,
+    const torch::Tensor &binningBuffer, const torch::Tensor &imageBuffer,
+    const bool debug);
+// Returns a bool tensor with one frustum-visibility flag per row of
+// `means3D`.
+torch::Tensor markVisible(torch::Tensor &means3D, torch::Tensor &viewmatrix,
+                          torch::Tensor &projmatrix);

gaussiancity/extensions/diff_gaussian_rasterization/setup.py ADDED Viewed

	@@ -0,0 +1,40 @@

+#
+# Copyright (C) 2023, Inria
+# GRAPHDECO research group, https://team.inria.fr/graphdeco
+# All rights reserved.
+#
+# This software is free for non-commercial, research and evaluation use
+# under the terms of the LICENSE.md file.
+#
+# For inquiries contact  george.drettakis@inria.fr
+#
+from setuptools import setup
+from torch.utils.cpp_extension import CUDAExtension, BuildExtension
+import os
+setup(
+    name="diff_gaussian_rasterization",
+    version="1.0.0",
+    ext_modules=[
+        CUDAExtension(
+            name="diff_gaussian_rasterization_ext",
+            sources=[
+                "cuda_rasterizer/rasterizer_impl.cu",
+                "cuda_rasterizer/forward.cu",
+                "cuda_rasterizer/backward.cu",
+                "rasterize_points.cu",
+                "bindings.cpp",
+            ],
+            extra_compile_args={
+                "nvcc": [
+                    "-I"
+                    + os.path.join(
+                        os.path.dirname(os.path.abspath(__file__)), "third_party/glm/"
+                    )
+                ]
+            },
+        )
+    ],
+    cmdclass={"build_ext": BuildExtension},
+)

gaussiancity/extensions/diff_gaussian_rasterization/third_party/glm ADDED Viewed

	@@ -0,0 +1 @@


1	+ Subproject commit 2d4c4b4dd31fde06cfffad7915c2b3006402322f

gaussiancity/extensions/diff_gaussian_rasterization/third_party/stbi_image_write.h ADDED Viewed

	@@ -0,0 +1,1724 @@

+/* stb_image_write - v1.16 - public domain - http://nothings.org/stb
+   writes out PNG/BMP/TGA/JPEG/HDR images to C stdio - Sean Barrett 2010-2015
+                                     no warranty implied; use at your own risk
+   Before #including,
+       #define STB_IMAGE_WRITE_IMPLEMENTATION
+   in the file that you want to have the implementation.
+   Will probably not work correctly with strict-aliasing optimizations.
+ABOUT:
+   This header file is a library for writing images to C stdio or a callback.
+   The PNG output is not optimal; it is 20-50% larger than the file
+   written by a decent optimizing implementation; though providing a custom
+   zlib compress function (see STBIW_ZLIB_COMPRESS) can mitigate that.
+   This library is designed for source code compactness and simplicity,
+   not optimal image file size or run-time performance.
+BUILDING:
+   You can #define STBIW_ASSERT(x) before the #include to avoid using assert.h.
+   You can #define STBIW_MALLOC(), STBIW_REALLOC(), and STBIW_FREE() to replace
+   malloc,realloc,free.
+   You can #define STBIW_MEMMOVE() to replace memmove()
+   You can #define STBIW_ZLIB_COMPRESS to use a custom zlib-style compress function
+   for PNG compression (instead of the builtin one), it must have the following signature:
+   unsigned char * my_compress(unsigned char *data, int data_len, int *out_len, int quality);
+   The returned data will be freed with STBIW_FREE() (free() by default),
+   so it must be heap allocated with STBIW_MALLOC() (malloc() by default),
+UNICODE:
+   If compiling for Windows and you wish to use Unicode filenames, compile
+   with
+       #define STBIW_WINDOWS_UTF8
+   and pass utf8-encoded filenames. Call stbiw_convert_wchar_to_utf8 to convert
+   Windows wchar_t filenames to utf8.
+USAGE:
+   There are five functions, one for each image file format:
+     int stbi_write_png(char const *filename, int w, int h, int comp, const void *data, int stride_in_bytes);
+     int stbi_write_bmp(char const *filename, int w, int h, int comp, const void *data);
+     int stbi_write_tga(char const *filename, int w, int h, int comp, const void *data);
+     int stbi_write_jpg(char const *filename, int w, int h, int comp, const void *data, int quality);
+     int stbi_write_hdr(char const *filename, int w, int h, int comp, const float *data);
+     void stbi_flip_vertically_on_write(int flag); // flag is non-zero to flip data vertically
+   There are also five equivalent functions that use an arbitrary write function. You are
+   expected to open/close your file-equivalent before and after calling these:
+     int stbi_write_png_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void  *data, int stride_in_bytes);
+     int stbi_write_bmp_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void  *data);
+     int stbi_write_tga_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void  *data);
+     int stbi_write_hdr_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const float *data);
+     int stbi_write_jpg_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data, int quality);
+   where the callback is:
+      void stbi_write_func(void *context, void *data, int size);
+   You can configure it with these global variables:
+      int stbi_write_tga_with_rle;             // defaults to true; set to 0 to disable RLE
+      int stbi_write_png_compression_level;    // defaults to 8; set to higher for more compression
+      int stbi_write_force_png_filter;         // defaults to -1; set to 0..5 to force a filter mode
+   You can define STBI_WRITE_NO_STDIO to disable the file variant of these
+   functions, so the library will not use stdio.h at all. However, this will
+   also disable HDR writing, because it requires stdio for formatted output.
+   Each function returns 0 on failure and non-0 on success.
+   The functions create an image file defined by the parameters. The image
+   is a rectangle of pixels stored from left-to-right, top-to-bottom.
+   Each pixel contains 'comp' channels of data stored interleaved with 8-bits
+   per channel, in the following order: 1=Y, 2=YA, 3=RGB, 4=RGBA. (Y is
+   monochrome color.) The rectangle is 'w' pixels wide and 'h' pixels tall.
+   The *data pointer points to the first byte of the top-left-most pixel.
+   For PNG, "stride_in_bytes" is the distance in bytes from the first byte of
+   a row of pixels to the first byte of the next row of pixels.
+   PNG creates output files with the same number of components as the input.
+   The BMP format expands Y to RGB in the file format and does not
+   output alpha.
+   PNG supports writing rectangles of data even when the bytes storing rows of
+   data are not consecutive in memory (e.g. sub-rectangles of a larger image),
+   by supplying the stride between the beginning of adjacent rows. The other
+   formats do not. (Thus you cannot write a native-format BMP through the BMP
+   writer, both because it is in BGR order and because it may have padding
+   at the end of the line.)
+   PNG allows you to set the deflate compression level by setting the global
+   variable 'stbi_write_png_compression_level' (it defaults to 8).
+   HDR expects linear float data. Since the format is always 32-bit rgb(e)
+   data, alpha (if provided) is discarded, and for monochrome data it is
+   replicated across all three channels.
+   TGA supports RLE or non-RLE compressed data. To use non-RLE-compressed
+   data, set the global variable 'stbi_write_tga_with_rle' to 0.
+   JPEG ignores alpha channels in input data; quality is between 1 and 100.
+   Higher quality looks better but results in a bigger image.
+   Only baseline JPEG is written (progressive JPEG is not supported).
+CREDITS:
+   Sean Barrett           -    PNG/BMP/TGA
+   Baldur Karlsson        -    HDR
+   Jean-Sebastien Guay    -    TGA monochrome
+   Tim Kelsey             -    misc enhancements
+   Alan Hickman           -    TGA RLE
+   Emmanuel Julien        -    initial file IO callback implementation
+   Jon Olick              -    original jo_jpeg.cpp code
+   Daniel Gibson          -    integrate JPEG, allow external zlib
+   Aarni Koskela          -    allow choosing PNG filter
+   bugfixes:
+      github:Chribba
+      Guillaume Chereau
+      github:jry2
+      github:romigrou
+      Sergio Gonzalez
+      Jonas Karlsson
+      Filip Wasil
+      Thatcher Ulrich
+      github:poppolopoppo
+      Patrick Boettcher
+      github:xeekworx
+      Cap Petschulat
+      Simon Rodriguez
+      Ivan Tikhonov
+      github:ignotion
+      Adam Schackart
+      Andrew Kensler
+LICENSE
+  See end of file for license information.
+*/
+#ifndef INCLUDE_STB_IMAGE_WRITE_H
+#define INCLUDE_STB_IMAGE_WRITE_H
+#include <stdlib.h>
+// if STB_IMAGE_WRITE_STATIC causes problems, try defining STBIWDEF to 'inline' or 'static inline'
+#ifndef STBIWDEF // STBIWDEF is the linkage prefix put on every public symbol; the includer may predefine it
+#ifdef STB_IMAGE_WRITE_STATIC
+#define STBIWDEF  static
+#else
+#ifdef __cplusplus
+#define STBIWDEF  extern "C"
+#else
+#define STBIWDEF  extern
+#endif
+#endif
+#endif
+#ifndef STB_IMAGE_WRITE_STATIC  // C++ forbids static forward declarations
+STBIWDEF int stbi_write_tga_with_rle; // non-zero selects the run-length-encoded TGA path
+STBIWDEF int stbi_write_png_compression_level;
+STBIWDEF int stbi_write_force_png_filter;
+#endif
+#ifndef STBI_WRITE_NO_STDIO // the filename-based writers are only declared when stdio is allowed
+STBIWDEF int stbi_write_png(char const *filename, int w, int h, int comp, const void  *data, int stride_in_bytes);
+STBIWDEF int stbi_write_bmp(char const *filename, int w, int h, int comp, const void  *data);
+STBIWDEF int stbi_write_tga(char const *filename, int w, int h, int comp, const void  *data);
+STBIWDEF int stbi_write_hdr(char const *filename, int w, int h, int comp, const float *data);
+STBIWDEF int stbi_write_jpg(char const *filename, int x, int y, int comp, const void  *data, int quality);
+#ifdef STBIW_WINDOWS_UTF8
+STBIWDEF int stbiw_convert_wchar_to_utf8(char *buffer, size_t bufferlen, const wchar_t* input);
+#endif
+#endif
+typedef void stbi_write_func(void *context, void *data, int size); // output sink: receives `size` bytes starting at `data`
+STBIWDEF int stbi_write_png_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void  *data, int stride_in_bytes);
+STBIWDEF int stbi_write_bmp_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void  *data);
+STBIWDEF int stbi_write_tga_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void  *data);
+STBIWDEF int stbi_write_hdr_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const float *data);
+STBIWDEF int stbi_write_jpg_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void  *data, int quality);
+STBIWDEF void stbi_flip_vertically_on_write(int flip_boolean); // non-zero makes the writers emit rows in reversed order
+#endif//INCLUDE_STB_IMAGE_WRITE_H
+#ifdef STB_IMAGE_WRITE_IMPLEMENTATION // the definitions below are compiled only when the includer defines this macro
+#ifdef _WIN32
+   #ifndef _CRT_SECURE_NO_WARNINGS
+   #define _CRT_SECURE_NO_WARNINGS
+   #endif
+   #ifndef _CRT_NONSTDC_NO_DEPRECATE
+   #define _CRT_NONSTDC_NO_DEPRECATE
+   #endif
+#endif
+#ifndef STBI_WRITE_NO_STDIO
+#include <stdio.h>
+#endif // STBI_WRITE_NO_STDIO
+#include <stdarg.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#if defined(STBIW_MALLOC) && defined(STBIW_FREE) && (defined(STBIW_REALLOC) || defined(STBIW_REALLOC_SIZED))
+// ok
+#elif !defined(STBIW_MALLOC) && !defined(STBIW_FREE) && !defined(STBIW_REALLOC) && !defined(STBIW_REALLOC_SIZED)
+// ok
+#else
+#error "Must define all or none of STBIW_MALLOC, STBIW_FREE, and STBIW_REALLOC (or STBIW_REALLOC_SIZED)."
+#endif
+#ifndef STBIW_MALLOC // no user allocator supplied: fall back to the C library
+#define STBIW_MALLOC(sz)        malloc(sz)
+#define STBIW_REALLOC(p,newsz)  realloc(p,newsz)
+#define STBIW_FREE(p)           free(p)
+#endif
+#ifndef STBIW_REALLOC_SIZED // default sized-realloc simply drops the old size
+#define STBIW_REALLOC_SIZED(p,oldsz,newsz) STBIW_REALLOC(p,newsz)
+#endif
+#ifndef STBIW_MEMMOVE
+#define STBIW_MEMMOVE(a,b,sz) memmove(a,b,sz)
+#endif
+#ifndef STBIW_ASSERT
+#include <assert.h>
+#define STBIW_ASSERT(x) assert(x)
+#endif
+#define STBIW_UCHAR(x) (unsigned char) ((x) & 0xff) // keep only the low 8 bits of x
+#ifdef STB_IMAGE_WRITE_STATIC // the three tunables get internal linkage in static builds
+static int stbi_write_png_compression_level = 8;
+static int stbi_write_tga_with_rle = 1;
+static int stbi_write_force_png_filter = -1;
+#else
+int stbi_write_png_compression_level = 8;
+int stbi_write_tga_with_rle = 1;
+int stbi_write_force_png_filter = -1;
+#endif
+static int stbi__flip_vertically_on_write = 0; // set through stbi_flip_vertically_on_write()
+STBIWDEF void stbi_flip_vertically_on_write(int flag) /* Store `flag` in the file-level flip setting.
+   The BMP/TGA pixel writers and the HDR writer consult it and reverse their row order when it is
+   non-zero. */
+{
+   stbi__flip_vertically_on_write = flag;
+}
+typedef struct /* Output state shared by the writers: a sink callback plus a small staging buffer. */
+{
+   stbi_write_func *func; // callback that receives every chunk of output
+   void *context; // opaque pointer handed back to func on each call
+   unsigned char buffer[64]; // staging area filled by stbiw__write1 / stbiw__write3
+   int buf_used; // number of bytes currently staged in buffer
+} stbi__write_context;
+// Initialize a callback-based context: record the sink callback and its user
+// pointer, and mark the staging buffer as empty.
+//   s       - context to initialize
+//   c       - callback that will receive all output bytes
+//   context - opaque pointer passed back to c on every call
+static void stbi__start_write_callbacks(stbi__write_context *s, stbi_write_func *c, void *context)
+{
+   s->func     = c;
+   s->context  = context;
+   s->buf_used = 0; // do not rely on the caller having zero-initialized *s
+}
+#ifndef STBI_WRITE_NO_STDIO
+static void stbi__stdio_write(void *context, void *data, int size) /* stbi_write_func adapter for stdio:
+   `context` is the FILE* to write to. The fwrite result is not checked, so a short or failed
+   write is not reported to the caller. */
+{
+   fwrite(data,1,size,(FILE*) context);
+}
+#if defined(_WIN32) && defined(STBIW_WINDOWS_UTF8) // optional UTF-8 filename support on Windows
+#ifdef __cplusplus
+#define STBIW_EXTERN extern "C"
+#else
+#define STBIW_EXTERN extern
+#endif
+STBIW_EXTERN __declspec(dllimport) int __stdcall MultiByteToWideChar(unsigned int cp, unsigned long flags, const char *str, int cbmb, wchar_t *widestr, int cchwide); // declared by hand here; no Windows header is included in this section
+STBIW_EXTERN __declspec(dllimport) int __stdcall WideCharToMultiByte(unsigned int cp, unsigned long flags, const wchar_t *widestr, int cchwide, char *str, int cbmb, const char *defchar, int *used_default);
+STBIWDEF int stbiw_convert_wchar_to_utf8(char *buffer, size_t bufferlen, const wchar_t* input) /* Convert the wide
+   string `input` to UTF-8 (code page 65001) in `buffer`, whose capacity is `bufferlen` bytes.
+   The input length is passed as -1 and the WideCharToMultiByte result is returned unchanged. */
+{
+   return WideCharToMultiByte(65001 /* UTF8 */, 0, input, -1, buffer, (int) bufferlen, NULL, NULL);
+}
+#endif
+static FILE *stbiw__fopen(char const *filename, char const *mode) /* Open `filename` with `mode` and
+   return the FILE*, or 0 on failure. The implementation is chosen at compile time: a wide-character
+   open after UTF-8 conversion (Windows with STBIW_WINDOWS_UTF8), fopen_s (MSVC 1400+), or fopen. */
+{
+   FILE *f;
+#if defined(_WIN32) && defined(STBIW_WINDOWS_UTF8)
+   wchar_t wMode[64];
+   wchar_t wFilename[1024];
+   if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, filename, -1, wFilename, sizeof(wFilename)/sizeof(*wFilename))) // conversion failed
+      return 0;
+   if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, mode, -1, wMode, sizeof(wMode)/sizeof(*wMode)))
+      return 0;
+#if defined(_MSC_VER) && _MSC_VER >= 1400
+   if (0 != _wfopen_s(&f, wFilename, wMode)) // non-zero result means the open failed
+      f = 0;
+#else
+   f = _wfopen(wFilename, wMode);
+#endif
+#elif defined(_MSC_VER) && _MSC_VER >= 1400
+   if (0 != fopen_s(&f, filename, mode))
+      f=0;
+#else
+   f = fopen(filename, mode);
+#endif
+   return f;
+}
+// Open `filename` for binary writing and bind the context to the stdio sink.
+// The context is bound even when the open fails (its context pointer is then
+// NULL). Returns 1 if the file was opened and 0 otherwise.
+static int stbi__start_write_file(stbi__write_context *s, const char *filename)
+{
+   FILE *fp = stbiw__fopen(filename, "wb");
+   stbi__start_write_callbacks(s, stbi__stdio_write, (void *) fp);
+   if (fp == NULL)
+      return 0;
+   return 1;
+}
+static void stbi__end_write_file(stbi__write_context *s) /* Close the FILE* stored in s->context.
+   The fclose result is ignored. */
+{
+   fclose((FILE *)s->context);
+}
+#endif // !STBI_WRITE_NO_STDIO
+typedef unsigned int stbiw_uint32;
+typedef int stb_image_write_test[sizeof(stbiw_uint32)==4 ? 1 : -1]; // compile-time check: the array size is -1 (an error) unless stbiw_uint32 is 4 bytes
+static void stbiw__writefv(stbi__write_context *s, const char *fmt, va_list v) /* Emit a sequence of
+   little-endian integers straight to the sink callback. Each character of `fmt` consumes one int
+   from `v`: '1' writes 1 byte, '2' writes 2 bytes, '4' writes 4 bytes; spaces are ignored. Any
+   other character trips STBIW_ASSERT and stops processing. */
+{
+   while (*fmt) {
+      switch (*fmt++) {
+         case ' ': break;
+         case '1': { unsigned char x = STBIW_UCHAR(va_arg(v, int));
+                     s->func(s->context,&x,1);
+                     break; }
+         case '2': { int x = va_arg(v,int);
+                     unsigned char b[2];
+                     b[0] = STBIW_UCHAR(x); // low byte first
+                     b[1] = STBIW_UCHAR(x>>8);
+                     s->func(s->context,b,2);
+                     break; }
+         case '4': { stbiw_uint32 x = va_arg(v,int);
+                     unsigned char b[4];
+                     b[0]=STBIW_UCHAR(x); // low byte first
+                     b[1]=STBIW_UCHAR(x>>8);
+                     b[2]=STBIW_UCHAR(x>>16);
+                     b[3]=STBIW_UCHAR(x>>24);
+                     s->func(s->context,b,4);
+                     break; }
+         default:
+            STBIW_ASSERT(0);
+            return;
+      }
+   }
+}
+static void stbiw__writef(stbi__write_context *s, const char *fmt, ...) /* Variadic front end for
+   stbiw__writefv: forwards `fmt` and the trailing arguments unchanged. */
+{
+   va_list v;
+   va_start(v, fmt);
+   stbiw__writefv(s, fmt, v);
+   va_end(v);
+}
+// Hand any bytes staged in s->buffer to the sink callback and reset the fill
+// count. Does nothing when the buffer is empty.
+static void stbiw__write_flush(stbi__write_context *s)
+{
+   if (s->buf_used == 0)
+      return;
+   s->func(s->context, &s->buffer, s->buf_used);
+   s->buf_used = 0;
+}
+static void stbiw__putc(stbi__write_context *s, unsigned char c) /* Send one byte directly to the sink
+   callback; the staging buffer is neither used nor flushed. */
+{
+   s->func(s->context, &c, 1);
+}
+// Append one byte to the staging buffer, flushing first if it would not fit.
+static void stbiw__write1(stbi__write_context *s, unsigned char a)
+{
+   const size_t capacity = sizeof(s->buffer);
+   if ((size_t)s->buf_used + 1 > capacity)
+      stbiw__write_flush(s);
+   s->buffer[s->buf_used] = a;
+   s->buf_used += 1;
+}
+// Append three bytes (a, b, c in that order) to the staging buffer, flushing
+// first if all three would not fit.
+static void stbiw__write3(stbi__write_context *s, unsigned char a, unsigned char b, unsigned char c)
+{
+   unsigned char *slot;
+   if ((size_t)s->buf_used + 3 > sizeof(s->buffer))
+      stbiw__write_flush(s);
+   slot = s->buffer + s->buf_used;
+   slot[0] = a;
+   slot[1] = b;
+   slot[2] = c;
+   s->buf_used += 3;
+}
+static void stbiw__write_pixel(stbi__write_context *s, int rgb_dir, int comp, int write_alpha, int expand_mono, unsigned char *d) /* Stage
+   one pixel. `d` points at `comp` source bytes. For 3/4-channel data rgb_dir picks the channel
+   order: 1 emits d[0],d[1],d[2] and -1 emits d[2],d[1],d[0]. write_alpha < 0 emits the last
+   channel before the colour bytes, > 0 emits it after them, and 0 emits no alpha byte.
+   expand_mono makes 1/2-channel input come out as three identical bytes. */
+{
+   unsigned char bg[3] = { 255, 0, 255}, px[3]; // bg: colour blended in when 4-channel alpha is dropped
+   int k;
+   if (write_alpha < 0)
+      stbiw__write1(s, d[comp - 1]);
+   switch (comp) {
+      case 2: // 2 pixels = mono + alpha, alpha is written separately, so same as 1-channel case
+      case 1:
+         if (expand_mono)
+            stbiw__write3(s, d[0], d[0], d[0]); // monochrome bmp
+         else
+            stbiw__write1(s, d[0]);  // monochrome TGA
+         break;
+      case 4:
+         if (!write_alpha) {
+            // composite against pink background
+            for (k = 0; k < 3; ++k)
+               px[k] = bg[k] + ((d[k] - bg[k]) * d[3]) / 255; // d[3] == 255 yields d[k], d[3] == 0 yields bg[k]
+            stbiw__write3(s, px[1 - rgb_dir], px[1], px[1 + rgb_dir]);
+            break;
+         }
+         /* FALLTHROUGH */
+      case 3:
+         stbiw__write3(s, d[1 - rgb_dir], d[1], d[1 + rgb_dir]);
+         break;
+   }
+   if (write_alpha > 0)
+      stbiw__write1(s, d[comp - 1]);
+}
+// Write an x-by-y image of tightly packed, `comp`-channel 8-bit pixels.
+//   rgb_dir, write_alpha, expand_mono - forwarded to stbiw__write_pixel
+//   vdir          - +1 walks rows first to last, -1 last to first; the sense
+//                   is inverted when stbi__flip_vertically_on_write is set
+//   scanline_pad  - number of zero bytes (at most 4) emitted after each row
+// Nothing is written when y <= 0.
+static void stbiw__write_pixels(stbi__write_context *s, int rgb_dir, int vdir, int x, int y, int comp, void *data, int write_alpha, int scanline_pad, int expand_mono)
+{
+   stbiw_uint32 zero = 0;
+   int i,j, j_end;
+   if (y <= 0)
+      return;
+   if (stbi__flip_vertically_on_write)
+      vdir *= -1;
+   if (vdir < 0) {
+      j_end = -1; j = y-1;
+   } else {
+      j_end =  y; j = 0;
+   }
+   for (; j != j_end; j += vdir) {
+      // j is in [0, y) inside the loop. Compute byte offsets in size_t so that
+      // large images cannot overflow int arithmetic.
+      unsigned char *row = (unsigned char *) data + (size_t) j * (size_t) x * (size_t) comp;
+      for (i=0; i < x; ++i) {
+         unsigned char *d = row + (size_t) i * (size_t) comp;
+         stbiw__write_pixel(s, rgb_dir, comp, write_alpha, expand_mono, d);
+      }
+      // push the staged row out before writing the padding directly
+      stbiw__write_flush(s);
+      s->func(s->context, &zero, scanline_pad);
+   }
+}
+// Write a file header followed by the pixel data. The header is described by
+// `fmt` and the trailing arguments (see stbiw__writefv); the remaining
+// parameters are forwarded to stbiw__write_pixels. Returns 0 without writing
+// anything when x or y is negative, and 1 otherwise.
+static int stbiw__outfile(stbi__write_context *s, int rgb_dir, int vdir, int x, int y, int comp, int expand_mono, void *data, int alpha, int pad, const char *fmt, ...)
+{
+   va_list header_args;
+   if (x < 0 || y < 0)
+      return 0;
+   va_start(header_args, fmt);
+   stbiw__writefv(s, fmt, header_args);
+   va_end(header_args);
+   stbiw__write_pixels(s, rgb_dir, vdir, x, y, comp, data, alpha, pad, expand_mono);
+   return 1;
+}
+static int stbi_write_bmp_core(stbi__write_context *s, int x, int y, int comp, const void *data) /* Write an
+   x-by-y image as BMP through `s`. Input with comp != 4 is written as 24 bits per pixel with a
+   40-byte info header; comp == 4 is written as 32 bits per pixel with a 108-byte V4 header.
+   Returns stbiw__outfile's result. */
+{
+   if (comp != 4) {
+      // write RGB bitmap
+      int pad = (-x*3) & 3; // bytes needed to round each 3*x-byte row up to a multiple of 4
+      return stbiw__outfile(s,-1,-1,x,y,comp,1,(void *) data,0,pad,
+              "11 4 22 4" "4 44 22 444444",
+              'B', 'M', 14+40+(x*3+pad)*y, 0,0, 14+40,  // file header
+               40, x,y, 1,24, 0,0,0,0,0,0);             // bitmap header
+   } else {
+      // RGBA bitmaps need a v4 header
+      // use BI_BITFIELDS mode with 32bpp and alpha mask
+      // (straight BI_RGB with alpha mask doesn't work in most readers)
+      return stbiw__outfile(s,-1,-1,x,y,comp,1,(void *)data,1,0,
+         "11 4 22 4" "4 44 22 444444 4444 4 444 444 444 444",
+         'B', 'M', 14+108+x*y*4, 0, 0, 14+108, // file header
+         108, x,y, 1,32, 3,0,0,0,0,0, 0xff0000,0xff00,0xff,0xff000000u, 0, 0,0,0, 0,0,0, 0,0,0, 0,0,0); // bitmap V4 header
+   }
+}
+// Write a BMP image through the caller-supplied callback `func`, which is
+// invoked with `context` for every chunk of output. Returns the result of
+// stbi_write_bmp_core.
+STBIWDEF int stbi_write_bmp_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data)
+{
+   stbi__write_context sink = { 0 };
+   stbi__start_write_callbacks(&sink, func, context);
+   return stbi_write_bmp_core(&sink, x, y, comp, data);
+}
+#ifndef STBI_WRITE_NO_STDIO
+// Write a BMP image to the file `filename`. Returns 0 if the file cannot be
+// opened; otherwise returns the result of stbi_write_bmp_core after closing
+// the file.
+STBIWDEF int stbi_write_bmp(char const *filename, int x, int y, int comp, const void *data)
+{
+   stbi__write_context sink = { 0 };
+   int result;
+   if (!stbi__start_write_file(&sink, filename))
+      return 0;
+   result = stbi_write_bmp_core(&sink, x, y, comp, data);
+   stbi__end_write_file(&sink);
+   return result;
+}
+#endif //!STBI_WRITE_NO_STDIO
+static int stbi_write_tga_core(stbi__write_context *s, int x, int y, int comp, void *data) /* Write an x-by-y
+   image with `comp` interleaved 8-bit channels as TGA through `s`. Output is run-length encoded
+   unless stbi_write_tga_with_rle is 0. Returns 0 for negative dimensions; the non-RLE path
+   returns stbiw__outfile's result and the RLE path returns 1. */
+{
+   int has_alpha = (comp == 2 || comp == 4);
+   int colorbytes = has_alpha ? comp-1 : comp;
+   int format = colorbytes < 2 ? 3 : 2; // 3 color channels (RGB/RGBA) = 2, 1 color channel (Y/YA) = 3
+   if (y < 0 || x < 0)
+      return 0;
+   if (!stbi_write_tga_with_rle) {
+      return stbiw__outfile(s, -1, -1, x, y, comp, 0, (void *) data, has_alpha, 0,
+         "111 221 2222 11", 0, 0, format, 0, 0, 0, 0, 0, x, y, (colorbytes + has_alpha) * 8, has_alpha * 8);
+   } else {
+      int i,j,k;
+      int jend, jdir;
+      stbiw__writef(s, "111 221 2222 11", 0,0,format+8, 0,0,0, 0,0,x,y, (colorbytes + has_alpha) * 8, has_alpha * 8); // same header as above, with 8 added to the image type
+      if (stbi__flip_vertically_on_write) {
+         j = 0;
+         jend = y;
+         jdir = 1;
+      } else {
+         j = y-1;
+         jend = -1;
+         jdir = -1;
+      }
+      for (; j != jend; j += jdir) {
+         unsigned char *row = (unsigned char *) data + j * x * comp;
+         int len; // number of pixels covered by the packet being built
+         for (i = 0; i < x; i += len) {
+            unsigned char *begin = row + i * comp;
+            int diff = 1; // non-zero: raw packet of differing pixels; zero: run of one repeated pixel
+            len = 1;
+            if (i < x - 1) {
+               ++len;
+               diff = memcmp(begin, row + (i + 1) * comp, comp);
+               if (diff) {
+                  const unsigned char *prev = begin;
+                  for (k = i + 2; k < x && len < 128; ++k) { // extend the raw packet while pixels keep differing
+                     if (memcmp(prev, row + k * comp, comp)) {
+                        prev += comp;
+                        ++len;
+                     } else {
+                        --len;
+                        break;
+                     }
+                  }
+               } else {
+                  for (k = i + 2; k < x && len < 128; ++k) { // extend the run while pixels equal the first one
+                     if (!memcmp(begin, row + k * comp, comp)) {
+                        ++len;
+                     } else {
+                        break;
+                     }
+                  }
+               }
+            }
+            if (diff) {
+               unsigned char header = STBIW_UCHAR(len - 1); // raw packet header: count minus one
+               stbiw__write1(s, header);
+               for (k = 0; k < len; ++k) {
+                  stbiw__write_pixel(s, -1, comp, has_alpha, 0, begin + k * comp);
+               }
+            } else {
+               unsigned char header = STBIW_UCHAR(len - 129); // after masking to 8 bits this equals (len - 1) + 128
+               stbiw__write1(s, header);
+               stbiw__write_pixel(s, -1, comp, has_alpha, 0, begin);
+            }
+         }
+      }
+      stbiw__write_flush(s);
+   }
+   return 1;
+}
+// Write a TGA image through the caller-supplied callback `func`, which is
+// invoked with `context` for every chunk of output. Returns the result of
+// stbi_write_tga_core.
+STBIWDEF int stbi_write_tga_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data)
+{
+   stbi__write_context sink = { 0 };
+   stbi__start_write_callbacks(&sink, func, context);
+   return stbi_write_tga_core(&sink, x, y, comp, (void *) data);
+}
+#ifndef STBI_WRITE_NO_STDIO
+// Write a TGA image to the file `filename`. Returns 0 if the file cannot be
+// opened; otherwise returns the result of stbi_write_tga_core after closing
+// the file.
+STBIWDEF int stbi_write_tga(char const *filename, int x, int y, int comp, const void *data)
+{
+   stbi__write_context sink = { 0 };
+   int result;
+   if (!stbi__start_write_file(&sink, filename))
+      return 0;
+   result = stbi_write_tga_core(&sink, x, y, comp, (void *) data);
+   stbi__end_write_file(&sink);
+   return result;
+}
+#endif
+// *************************************************************************************************
+// Radiance RGBE HDR writer
+// by Baldur Karlsson
+#define stbiw__max(a, b)  ((a) > (b) ? (a) : (b)) // NOTE: the larger argument is evaluated twice
+#ifndef STBI_WRITE_NO_STDIO
+// Convert one linear RGB triple into a 4-byte shared-exponent RGBE pixel.
+//   rgbe   - receives three mantissa bytes followed by the exponent byte
+//   linear - three float components
+// A pixel whose largest component is below 1e-32 is stored as four zero bytes.
+static void stbiw__linear_to_rgbe(unsigned char *rgbe, float *linear)
+{
+   int exponent;
+   int channel;
+   float scale;
+   float brightest = linear[2];
+   if (linear[1] > brightest)
+      brightest = linear[1];
+   if (linear[0] > brightest)
+      brightest = linear[0];
+   if (brightest < 1e-32f) {
+      for (channel = 0; channel < 4; ++channel)
+         rgbe[channel] = 0;
+      return;
+   }
+   // frexp yields the mantissa of the brightest component and its exponent;
+   // every channel is scaled by the same factor
+   scale = (float) frexp(brightest, &exponent) * 256.0f/brightest;
+   for (channel = 0; channel < 3; ++channel)
+      rgbe[channel] = (unsigned char)(linear[channel] * scale);
+   rgbe[3] = (unsigned char)(exponent + 128);
+}
+static void stbiw__write_run_data(stbi__write_context *s, int length, unsigned char databyte) /* Emit a
+   run packet: one byte holding length+128, then the repeated value. `length` must not exceed 127
+   (asserted). */
+{
+   unsigned char lengthbyte = STBIW_UCHAR(length+128);
+   STBIW_ASSERT(length+128 <= 255);
+   s->func(s->context, &lengthbyte, 1);
+   s->func(s->context, &databyte, 1);
+}
+static void stbiw__write_dump_data(stbi__write_context *s, int length, unsigned char *data) /* Emit a
+   literal packet: one byte holding `length`, then `length` bytes copied from `data`. `length` must
+   not exceed 128 (asserted). */
+{
+   unsigned char lengthbyte = STBIW_UCHAR(length);
+   STBIW_ASSERT(length <= 128); // inconsistent with spec but consistent with official code
+   s->func(s->context, &lengthbyte, 1);
+   s->func(s->context, data, length);
+}
+static void stbiw__write_hdr_scanline(stbi__write_context *s, int width, int ncomp, unsigned char *scratch, float *scanline) /* Encode
+   one scanline of `width` float pixels (`ncomp` channels each) as RGBE. Widths below 8 or of
+   32768 and above are written as flat 4-byte pixels. Otherwise the four RGBE planes are gathered
+   in `scratch` (indexed up to width*4 bytes) and each plane is run-length encoded in turn. With
+   ncomp of 3 or 4 the first three channels are used; any other ncomp replicates channel 0. */
+{
+   unsigned char scanlineheader[4] = { 2, 2, 0, 0 };
+   unsigned char rgbe[4];
+   float linear[3];
+   int x;
+   scanlineheader[2] = (width&0xff00)>>8; // width, high byte first
+   scanlineheader[3] = (width&0x00ff);
+   /* skip RLE for images too small or large */
+   if (width < 8 || width >= 32768) {
+      for (x=0; x < width; x++) {
+         switch (ncomp) {
+            case 4: /* fallthrough */
+            case 3: linear[2] = scanline[x*ncomp + 2];
+                    linear[1] = scanline[x*ncomp + 1];
+                    linear[0] = scanline[x*ncomp + 0];
+                    break;
+            default:
+                    linear[0] = linear[1] = linear[2] = scanline[x*ncomp + 0];
+                    break;
+         }
+         stbiw__linear_to_rgbe(rgbe, linear);
+         s->func(s->context, rgbe, 4);
+      }
+   } else {
+      int c,r;
+      /* encode into scratch buffer */
+      for (x=0; x < width; x++) {
+         switch(ncomp) {
+            case 4: /* fallthrough */
+            case 3: linear[2] = scanline[x*ncomp + 2];
+                    linear[1] = scanline[x*ncomp + 1];
+                    linear[0] = scanline[x*ncomp + 0];
+                    break;
+            default:
+                    linear[0] = linear[1] = linear[2] = scanline[x*ncomp + 0];
+                    break;
+         }
+         stbiw__linear_to_rgbe(rgbe, linear);
+         scratch[x + width*0] = rgbe[0]; // planar layout: all R bytes, then G, B and E
+         scratch[x + width*1] = rgbe[1];
+         scratch[x + width*2] = rgbe[2];
+         scratch[x + width*3] = rgbe[3];
+      }
+      s->func(s->context, scanlineheader, 4);
+      /* RLE each component separately */
+      for (c=0; c < 4; c++) {
+         unsigned char *comp = &scratch[width*c];
+         x = 0;
+         while (x < width) {
+            // find first run
+            r = x;
+            while (r+2 < width) {
+               if (comp[r] == comp[r+1] && comp[r] == comp[r+2]) // three equal bytes start a run
+                  break;
+               ++r;
+            }
+            if (r+2 >= width)
+               r = width;
+            // dump up to first run
+            while (x < r) {
+               int len = r-x;
+               if (len > 128) len = 128;
+               stbiw__write_dump_data(s, len, &comp[x]);
+               x += len;
+            }
+            // if there's a run, output it
+            if (r+2 < width) { // same test as what we break out of in search loop, so only true if we break'd
+               // find next byte after run
+               while (r < width && comp[r] == comp[x])
+                  ++r;
+               // output run up to r
+               while (x < r) {
+                  int len = r-x;
+                  if (len > 127) len = 127;
+                  stbiw__write_run_data(s, len, comp[x]);
+                  x += len;
+               }
+            }
+         }
+      }
+   }
+}
+// Write an x-by-y float image with `comp` channels per pixel as a Radiance
+// RGBE (.hdr) stream through `s`.
+// Returns 0, writing nothing, when a dimension is not positive, `data` is
+// NULL, or the per-scanline scratch buffer cannot be allocated; returns 1
+// otherwise.
+static int stbi_write_hdr_core(stbi__write_context *s, int x, int y, int comp, float *data)
+{
+   if (y <= 0 || x <= 0 || data == NULL)
+      return 0;
+   else {
+      // Each component is stored separately. Allocate scratch space for full output scanline.
+      // The size is computed in size_t so that a very wide image cannot overflow int.
+      unsigned char *scratch = (unsigned char *) STBIW_MALLOC((size_t) x * 4);
+      int i, len;
+      char buffer[128];
+      char header[] = "#?RADIANCE\n# Written by stb_image_write.h\nFORMAT=32-bit_rle_rgbe\n";
+      // Fail before emitting any output: the scanline encoder writes into
+      // scratch unconditionally on its RLE path.
+      if (scratch == NULL)
+         return 0;
+      s->func(s->context, header, sizeof(header)-1);
+#ifdef __STDC_LIB_EXT1__
+      len = sprintf_s(buffer, sizeof(buffer), "EXPOSURE=          1.0000000000000\n\n-Y %d +X %d\n", y, x);
+#else
+      len = sprintf(buffer, "EXPOSURE=          1.0000000000000\n\n-Y %d +X %d\n", y, x);
+#endif
+      s->func(s->context, buffer, len);
+      for(i=0; i < y; i++) {
+         // source row for output row i; reversed when vertical flipping is on
+         int row = stbi__flip_vertically_on_write ? y-1-i : i;
+         stbiw__write_hdr_scanline(s, x, comp, scratch, data + (size_t) comp * (size_t) x * (size_t) row);
+      }
+      STBIW_FREE(scratch);
+      return 1;
+   }
+}
+// Write an HDR image through the caller-supplied callback `func`, which is
+// invoked with `context` for every chunk of output. Returns the result of
+// stbi_write_hdr_core.
+STBIWDEF int stbi_write_hdr_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const float *data)
+{
+   stbi__write_context sink = { 0 };
+   stbi__start_write_callbacks(&sink, func, context);
+   return stbi_write_hdr_core(&sink, x, y, comp, (float *) data);
+}
+// Write an HDR image to the file `filename`. Returns 0 if the file cannot be
+// opened; otherwise returns the result of stbi_write_hdr_core after closing
+// the file.
+STBIWDEF int stbi_write_hdr(char const *filename, int x, int y, int comp, const float *data)
+{
+   stbi__write_context sink = { 0 };
+   int result;
+   if (!stbi__start_write_file(&sink, filename))
+      return 0;
+   result = stbi_write_hdr_core(&sink, x, y, comp, (float *) data);
+   stbi__end_write_file(&sink);
+   return result;
+}
+#endif // STBI_WRITE_NO_STDIO
+//////////////////////////////////////////////////////////////////////////////
+//
+// PNG writer
+//
+#ifndef STBIW_ZLIB_COMPRESS
+// stretchy buffer; stbiw__sbpush() == vector<>::push_back() -- stbiw__sbcount() == vector<>::size()
+#define stbiw__sbraw(a) ((int *) (void *) (a) - 2) // the two-int header stored immediately before the elements
+#define stbiw__sbm(a)   stbiw__sbraw(a)[0] // header word 0: capacity in elements
+#define stbiw__sbn(a)   stbiw__sbraw(a)[1] // header word 1: number of elements in use
+#define stbiw__sbneedgrow(a,n)  ((a)==0 || stbiw__sbn(a)+n >= stbiw__sbm(a))
+#define stbiw__sbmaybegrow(a,n) (stbiw__sbneedgrow(a,(n)) ? stbiw__sbgrow(a,n) : 0)
+#define stbiw__sbgrow(a,n)  stbiw__sbgrowf((void **) &(a), (n), sizeof(*(a)))
+#define stbiw__sbpush(a, v)      (stbiw__sbmaybegrow(a,1), (a)[stbiw__sbn(a)++] = (v)) // may reallocate, so `a` must be an lvalue
+#define stbiw__sbcount(a)        ((a) ? stbiw__sbn(a) : 0)
+#define stbiw__sbfree(a)         ((a) ? STBIW_FREE(stbiw__sbraw(a)),0 : 0)
+static void *stbiw__sbgrowf(void **arr, int increment, int itemsize) /* Grow (or create, when *arr is
+   NULL) the stretchy buffer behind *arr. The new capacity is 2*old+increment elements, or
+   increment+1 for a new buffer. On success *arr points at the element area of the resized block;
+   if reallocation fails (and STBIW_ASSERT is compiled out) *arr is left unchanged. Returns *arr. */
+{
+   int m = *arr ? 2*stbiw__sbm(*arr)+increment : increment+1;
+   void *p = STBIW_REALLOC_SIZED(*arr ? stbiw__sbraw(*arr) : 0, *arr ? (stbiw__sbm(*arr)*itemsize + sizeof(int)*2) : 0, itemsize * m + sizeof(int)*2);
+   STBIW_ASSERT(p);
+   if (p) {
+      if (!*arr) ((int *) p)[1] = 0; // brand-new buffer: element count starts at zero
+      *arr = (void *) ((int *) p + 2); // skip the two-int header
+      stbiw__sbm(*arr) = m;
+   }
+   return *arr;
+}
+static unsigned char *stbiw__zlib_flushf(unsigned char *data, unsigned int *bitbuffer, int *bitcount) /* Move
+   every complete byte from the bit accumulator to the stretchy output buffer `data`, low byte
+   first. Returns `data`, which may have been reallocated by the push. */
+{
+   while (*bitcount >= 8) {
+      stbiw__sbpush(data, STBIW_UCHAR(*bitbuffer));
+      *bitbuffer >>= 8;
+      *bitcount -= 8;
+   }
+   return data;
+}
+// Return the low `codebits` bits of `code` in reversed bit order.
+static int stbiw__zlib_bitrev(int code, int codebits)
+{
+   int reversed = 0;
+   for (; codebits != 0; --codebits) {
+      // shift the result up and append the current lowest bit of code
+      reversed = (reversed << 1) | (code & 1);
+      code >>= 1;
+   }
+   return reversed;
+}
+// Count how many leading bytes of `a` and `b` are equal, examining at most
+// `limit` bytes and never more than 258.
+static unsigned int stbiw__zlib_countm(unsigned char *a, unsigned char *b, int limit)
+{
+   int matched = 0;
+   int cap = limit < 258 ? limit : 258;
+   while (matched < cap && a[matched] == b[matched])
+      ++matched;
+   return matched;
+}
+static unsigned int stbiw__zhash(unsigned char *data) /* Hash the three bytes data[0..2] into a 32-bit
+   value. Callers mask the result down to a hash-table index. */
+{
+   stbiw_uint32 hash = data[0] + (data[1] << 8) + (data[2] << 16); // pack the three bytes into one word
+   hash ^= hash << 3;
+   hash += hash >> 5;
+   hash ^= hash << 4;
+   hash += hash >> 17;
+   hash ^= hash << 25;
+   hash += hash >> 6;
+   return hash;
+}
+#define stbiw__zlib_flush() (out = stbiw__zlib_flushf(out, &bitbuf, &bitcount)) // relies on locals out, bitbuf, bitcount at the use site
+#define stbiw__zlib_add(code,codebits) \
+      (bitbuf |= (code) << bitcount, bitcount += (codebits), stbiw__zlib_flush())
+#define stbiw__zlib_huffa(b,c)  stbiw__zlib_add(stbiw__zlib_bitrev(b,c),c) // emit code b of c bits in reversed bit order
+// default huffman tables
+#define stbiw__zlib_huff1(n)  stbiw__zlib_huffa(0x30 + (n), 8)
+#define stbiw__zlib_huff2(n)  stbiw__zlib_huffa(0x190 + (n)-144, 9)
+#define stbiw__zlib_huff3(n)  stbiw__zlib_huffa(0 + (n)-256,7)
+#define stbiw__zlib_huff4(n)  stbiw__zlib_huffa(0xc0 + (n)-280,8)
+#define stbiw__zlib_huff(n)  ((n) <= 143 ? stbiw__zlib_huff1(n) : (n) <= 255 ? stbiw__zlib_huff2(n) : (n) <= 279 ? stbiw__zlib_huff3(n) : stbiw__zlib_huff4(n))
+#define stbiw__zlib_huffb(n) ((n) <= 143 ? stbiw__zlib_huff1(n) : stbiw__zlib_huff2(n)) // shortcut for symbols 0..255 only
+#define stbiw__ZHASH   16384 // number of hash-table buckets; used as a mask, so it must be a power of two
+#endif // STBIW_ZLIB_COMPRESS
+STBIWDEF unsigned char * stbi_zlib_compress(unsigned char *data, int data_len, int *out_len, int quality) /* Compress
+   `data_len` bytes at `data` into a zlib stream. The stream length is stored in *out_len and the
+   returned pointer can be released with STBIW_FREE. `quality` is raised to at least 5 and bounds
+   how many candidate positions are kept per hash bucket (2*quality). Returns NULL if the hash
+   table cannot be allocated. When STBIW_ZLIB_COMPRESS is defined, the call is forwarded to it. */
+{
+#ifdef STBIW_ZLIB_COMPRESS
+   // user provided a zlib compress implementation, use that
+   return STBIW_ZLIB_COMPRESS(data, data_len, out_len, quality);
+#else // use builtin
+   static unsigned short lengthc[] = { 3,4,5,6,7,8,9,10,11,13,15,17,19,23,27,31,35,43,51,59,67,83,99,115,131,163,195,227,258, 259 }; // base match length per length code, plus a sentinel
+   static unsigned char  lengtheb[]= { 0,0,0,0,0,0,0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4,  4,  5,  5,  5,  5,  0 }; // extra bits per length code
+   static unsigned short distc[]   = { 1,2,3,4,5,7,9,13,17,25,33,49,65,97,129,193,257,385,513,769,1025,1537,2049,3073,4097,6145,8193,12289,16385,24577, 32768 }; // base distance per distance code, plus a sentinel
+   static unsigned char  disteb[]  = { 0,0,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13 }; // extra bits per distance code
+   unsigned int bitbuf=0;
+   int i,j, bitcount=0;
+   unsigned char *out = NULL;
+   unsigned char ***hash_table = (unsigned char***) STBIW_MALLOC(stbiw__ZHASH * sizeof(unsigned char**)); // each bucket is a stretchy list of earlier positions
+   if (hash_table == NULL)
+      return NULL;
+   if (quality < 5) quality = 5;
+   stbiw__sbpush(out, 0x78);   // DEFLATE 32K window
+   stbiw__sbpush(out, 0x5e);   // FLEVEL = 1
+   stbiw__zlib_add(1,1);  // BFINAL = 1
+   stbiw__zlib_add(1,2);  // BTYPE = 1 -- fixed huffman
+   for (i=0; i < stbiw__ZHASH; ++i)
+      hash_table[i] = NULL;
+   i=0;
+   while (i < data_len-3) {
+      // hash next 3 bytes of data to be compressed
+      int h = stbiw__zhash(data+i)&(stbiw__ZHASH-1), best=3;
+      unsigned char *bestloc = 0;
+      unsigned char **hlist = hash_table[h];
+      int n = stbiw__sbcount(hlist);
+      for (j=0; j < n; ++j) {
+         if (hlist[j]-data > i-32768) { // if entry lies within window
+            int d = stbiw__zlib_countm(hlist[j], data+i, data_len-i);
+            if (d >= best) { best=d; bestloc=hlist[j]; }
+         }
+      }
+      // when hash table entry is too long, delete half the entries
+      if (hash_table[h] && stbiw__sbn(hash_table[h]) == 2*quality) {
+         STBIW_MEMMOVE(hash_table[h], hash_table[h]+quality, sizeof(hash_table[h][0])*quality);
+         stbiw__sbn(hash_table[h]) = quality;
+      }
+      stbiw__sbpush(hash_table[h],data+i);
+      if (bestloc) {
+         // "lazy matching" - check match at *next* byte, and if it's better, do cur byte as literal
+         h = stbiw__zhash(data+i+1)&(stbiw__ZHASH-1);
+         hlist = hash_table[h];
+         n = stbiw__sbcount(hlist);
+         for (j=0; j < n; ++j) {
+            if (hlist[j]-data > i-32767) {
+               int e = stbiw__zlib_countm(hlist[j], data+i+1, data_len-i-1);
+               if (e > best) { // if next match is better, bail on current match
+                  bestloc = NULL;
+                  break;
+               }
+            }
+         }
+      }
+      if (bestloc) {
+         int d = (int) (data+i - bestloc); // distance back
+         STBIW_ASSERT(d <= 32767 && best <= 258);
+         for (j=0; best > lengthc[j+1]-1; ++j); // find the length code whose range contains best
+         stbiw__zlib_huff(j+257);
+         if (lengtheb[j]) stbiw__zlib_add(best - lengthc[j], lengtheb[j]);
+         for (j=0; d > distc[j+1]-1; ++j); // find the distance code whose range contains d
+         stbiw__zlib_add(stbiw__zlib_bitrev(j,5),5);
+         if (disteb[j]) stbiw__zlib_add(d - distc[j], disteb[j]);
+         i += best;
+      } else {
+         stbiw__zlib_huffb(data[i]);
+         ++i;
+      }
+   }
+   // write out final bytes
+   for (;i < data_len; ++i)
+      stbiw__zlib_huffb(data[i]);
+   stbiw__zlib_huff(256); // end of block
+   // pad with 0 bits to byte boundary
+   while (bitcount)
+      stbiw__zlib_add(0,1);
+   for (i=0; i < stbiw__ZHASH; ++i)
+      (void) stbiw__sbfree(hash_table[i]);
+   STBIW_FREE(hash_table);
+   // store uncompressed instead if compression was worse
+   if (stbiw__sbn(out) > data_len + 2 + ((data_len+32766)/32767)*5) {
+      stbiw__sbn(out) = 2;  // truncate to DEFLATE 32K window and FLEVEL = 1
+      for (j = 0; j < data_len;) {
+         int blocklen = data_len - j;
+         if (blocklen > 32767) blocklen = 32767;
+         stbiw__sbpush(out, data_len - j == blocklen); // BFINAL = ?, BTYPE = 0 -- no compression
+         stbiw__sbpush(out, STBIW_UCHAR(blocklen)); // LEN
+         stbiw__sbpush(out, STBIW_UCHAR(blocklen >> 8));
+         stbiw__sbpush(out, STBIW_UCHAR(~blocklen)); // NLEN
+         stbiw__sbpush(out, STBIW_UCHAR(~blocklen >> 8));
+         memcpy(out+stbiw__sbn(out), data+j, blocklen); // no grow call here: this branch runs only when the buffer already held more bytes than the stored form needs
+         stbiw__sbn(out) += blocklen;
+         j += blocklen;
+      }
+   }
+   {
+      // compute adler32 on input
+      unsigned int s1=1, s2=0;
+      int blocklen = (int) (data_len % 5552);
+      j=0;
+      while (j < data_len) {
+         for (i=0; i < blocklen; ++i) { s1 += data[j+i]; s2 += s1; }
+         s1 %= 65521; s2 %= 65521;
+         j += blocklen;
+         blocklen = 5552;
+      }
+      stbiw__sbpush(out, STBIW_UCHAR(s2 >> 8)); // checksum is appended high byte first
+      stbiw__sbpush(out, STBIW_UCHAR(s2));
+      stbiw__sbpush(out, STBIW_UCHAR(s1 >> 8));
+      stbiw__sbpush(out, STBIW_UCHAR(s1));
+   }
+   *out_len = stbiw__sbn(out);
+   // make returned pointer freeable
+   STBIW_MEMMOVE(stbiw__sbraw(out), out, *out_len);
+   return (unsigned char *) stbiw__sbraw(out);
+#endif // STBIW_ZLIB_COMPRESS
+}
+static unsigned int stbiw__crc32(unsigned char *buffer, int len)
+{
+   // Returns the CRC-32 of the first len bytes of buffer. When the user has
+   // defined STBIW_CRC32, the whole computation is delegated to that macro.
+#ifdef STBIW_CRC32
+    return STBIW_CRC32(buffer, len);
+#else
+   // One precomputed entry per possible byte value.
+   static unsigned int crc_table[256] =
+   {
+      0x00000000, 0x77073096, 0xEE0E612C, 0x990951BA, 0x076DC419, 0x706AF48F, 0xE963A535, 0x9E6495A3,
+      0x0eDB8832, 0x79DCB8A4, 0xE0D5E91E, 0x97D2D988, 0x09B64C2B, 0x7EB17CBD, 0xE7B82D07, 0x90BF1D91,
+      0x1DB71064, 0x6AB020F2, 0xF3B97148, 0x84BE41DE, 0x1ADAD47D, 0x6DDDE4EB, 0xF4D4B551, 0x83D385C7,
+      0x136C9856, 0x646BA8C0, 0xFD62F97A, 0x8A65C9EC, 0x14015C4F, 0x63066CD9, 0xFA0F3D63, 0x8D080DF5,
+      0x3B6E20C8, 0x4C69105E, 0xD56041E4, 0xA2677172, 0x3C03E4D1, 0x4B04D447, 0xD20D85FD, 0xA50AB56B,
+      0x35B5A8FA, 0x42B2986C, 0xDBBBC9D6, 0xACBCF940, 0x32D86CE3, 0x45DF5C75, 0xDCD60DCF, 0xABD13D59,
+      0x26D930AC, 0x51DE003A, 0xC8D75180, 0xBFD06116, 0x21B4F4B5, 0x56B3C423, 0xCFBA9599, 0xB8BDA50F,
+      0x2802B89E, 0x5F058808, 0xC60CD9B2, 0xB10BE924, 0x2F6F7C87, 0x58684C11, 0xC1611DAB, 0xB6662D3D,
+      0x76DC4190, 0x01DB7106, 0x98D220BC, 0xEFD5102A, 0x71B18589, 0x06B6B51F, 0x9FBFE4A5, 0xE8B8D433,
+      0x7807C9A2, 0x0F00F934, 0x9609A88E, 0xE10E9818, 0x7F6A0DBB, 0x086D3D2D, 0x91646C97, 0xE6635C01,
+      0x6B6B51F4, 0x1C6C6162, 0x856530D8, 0xF262004E, 0x6C0695ED, 0x1B01A57B, 0x8208F4C1, 0xF50FC457,
+      0x65B0D9C6, 0x12B7E950, 0x8BBEB8EA, 0xFCB9887C, 0x62DD1DDF, 0x15DA2D49, 0x8CD37CF3, 0xFBD44C65,
+      0x4DB26158, 0x3AB551CE, 0xA3BC0074, 0xD4BB30E2, 0x4ADFA541, 0x3DD895D7, 0xA4D1C46D, 0xD3D6F4FB,
+      0x4369E96A, 0x346ED9FC, 0xAD678846, 0xDA60B8D0, 0x44042D73, 0x33031DE5, 0xAA0A4C5F, 0xDD0D7CC9,
+      0x5005713C, 0x270241AA, 0xBE0B1010, 0xC90C2086, 0x5768B525, 0x206F85B3, 0xB966D409, 0xCE61E49F,
+      0x5EDEF90E, 0x29D9C998, 0xB0D09822, 0xC7D7A8B4, 0x59B33D17, 0x2EB40D81, 0xB7BD5C3B, 0xC0BA6CAD,
+      0xEDB88320, 0x9ABFB3B6, 0x03B6E20C, 0x74B1D29A, 0xEAD54739, 0x9DD277AF, 0x04DB2615, 0x73DC1683,
+      0xE3630B12, 0x94643B84, 0x0D6D6A3E, 0x7A6A5AA8, 0xE40ECF0B, 0x9309FF9D, 0x0A00AE27, 0x7D079EB1,
+      0xF00F9344, 0x8708A3D2, 0x1E01F268, 0x6906C2FE, 0xF762575D, 0x806567CB, 0x196C3671, 0x6E6B06E7,
+      0xFED41B76, 0x89D32BE0, 0x10DA7A5A, 0x67DD4ACC, 0xF9B9DF6F, 0x8EBEEFF9, 0x17B7BE43, 0x60B08ED5,
+      0xD6D6A3E8, 0xA1D1937E, 0x38D8C2C4, 0x4FDFF252, 0xD1BB67F1, 0xA6BC5767, 0x3FB506DD, 0x48B2364B,
+      0xD80D2BDA, 0xAF0A1B4C, 0x36034AF6, 0x41047A60, 0xDF60EFC3, 0xA867DF55, 0x316E8EEF, 0x4669BE79,
+      0xCB61B38C, 0xBC66831A, 0x256FD2A0, 0x5268E236, 0xCC0C7795, 0xBB0B4703, 0x220216B9, 0x5505262F,
+      0xC5BA3BBE, 0xB2BD0B28, 0x2BB45A92, 0x5CB36A04, 0xC2D7FFA7, 0xB5D0CF31, 0x2CD99E8B, 0x5BDEAE1D,
+      0x9B64C2B0, 0xEC63F226, 0x756AA39C, 0x026D930A, 0x9C0906A9, 0xEB0E363F, 0x72076785, 0x05005713,
+      0x95BF4A82, 0xE2B87A14, 0x7BB12BAE, 0x0CB61B38, 0x92D28E9B, 0xE5D5BE0D, 0x7CDCEFB7, 0x0BDBDF21,
+      0x86D3D2D4, 0xF1D4E242, 0x68DDB3F8, 0x1FDA836E, 0x81BE16CD, 0xF6B9265B, 0x6FB077E1, 0x18B74777,
+      0x88085AE6, 0xFF0F6A70, 0x66063BCA, 0x11010B5C, 0x8F659EFF, 0xF862AE69, 0x616BFFD3, 0x166CCF45,
+      0xA00AE278, 0xD70DD2EE, 0x4E048354, 0x3903B3C2, 0xA7672661, 0xD06016F7, 0x4969474D, 0x3E6E77DB,
+      0xAED16A4A, 0xD9D65ADC, 0x40DF0B66, 0x37D83BF0, 0xA9BCAE53, 0xDEBB9EC5, 0x47B2CF7F, 0x30B5FFE9,
+      0xBDBDF21C, 0xCABAC28A, 0x53B39330, 0x24B4A3A6, 0xBAD03605, 0xCDD70693, 0x54DE5729, 0x23D967BF,
+      0xB3667A2E, 0xC4614AB8, 0x5D681B02, 0x2A6F2B94, 0xB40BBE37, 0xC30C8EA1, 0x5A05DF1B, 0x2D02EF8D
+   };
+   unsigned int state = ~0u;   // running value starts with every bit set
+   int pos = 0;
+   while (pos < len) {
+      // low byte of the running value, mixed with the next input byte, picks the table entry
+      unsigned int slot = (state ^ buffer[pos]) & 0xff;
+      state = crc_table[slot] ^ (state >> 8);
+      ++pos;
+   }
+   return ~state;              // result is the bitwise complement of the running value
+#endif
+}
+#define stbiw__wpng4(o,a,b,c,d) ((o)[0]=STBIW_UCHAR(a),(o)[1]=STBIW_UCHAR(b),(o)[2]=STBIW_UCHAR(c),(o)[3]=STBIW_UCHAR(d),(o)+=4) // store a,b,c,d (each through STBIW_UCHAR) at o[0..3], then advance o by 4
+#define stbiw__wp32(data,v) stbiw__wpng4(data, (v)>>24,(v)>>16,(v)>>8,(v)); // write v as four bytes, most significant first; NOTE: the expansion already ends in ';'
+#define stbiw__wptag(data,s) stbiw__wpng4(data, s[0],s[1],s[2],s[3]) // write the first four characters of s (used for chunk tags such as "IHDR")
+static void stbiw__wpcrc(unsigned char **data, int len)
+{
+   // Appends a checksum at the write cursor *data and advances the cursor
+   // past it. The checksum covers the len bytes just written plus the four
+   // bytes immediately before them.
+   unsigned char *covered_start = *data - (len + 4);
+   unsigned int checksum = stbiw__crc32(covered_start, len + 4);
+   stbiw__wp32(*data, checksum);
+}
+static unsigned char stbiw__paeth(int a, int b, int c)
+{
+   // Picks whichever of a, b, c lies closest to the estimate a + b - c and
+   // returns it through STBIW_UCHAR. Ties are resolved in the order a, b, c.
+   int estimate = a + b - c;
+   int dist_a = abs(estimate - a);
+   int dist_b = abs(estimate - b);
+   int dist_c = abs(estimate - c);
+   if (dist_a > dist_b || dist_a > dist_c) {
+      // a is not the closest candidate; choose between b and c
+      if (dist_b <= dist_c)
+         return STBIW_UCHAR(b);
+      return STBIW_UCHAR(c);
+   }
+   return STBIW_UCHAR(a);
+}
+// @OPTIMIZE: provide an option that always forces left-predict or paeth predict
+static void stbiw__encode_png_line(unsigned char *pixels, int stride_bytes, int width, int height, int y, int n, int filter_type, signed char *line_buffer)
+{  /* Applies one row filter to a single image row and stores the width*n
+    * filtered bytes in line_buffer.
+    *
+    *   pixels, stride_bytes  source image and the byte distance between rows
+    *   width, height, n      pixels per row, number of rows, bytes per pixel
+    *   y                     index of the output row; the source row is
+    *                         height-1-y when stbi__flip_vertically_on_write is set
+    *   filter_type           index (0..4) into the mapping tables below
+    *   line_buffer           destination, at least width*n bytes
+    */
+   static int mapping[] = { 0,1,2,3,4 };  // rows after the first: filter_type is used unchanged
+   static int firstmap[] = { 0,1,0,5,6 }; // output row 0: 2->0, 3->5, 4->6, variants that never read a previous row
+   int *mymap = (y != 0) ? mapping : firstmap;
+   int i;
+   int type = mymap[filter_type];
+   unsigned char *z = pixels + stride_bytes * (stbi__flip_vertically_on_write ? height-1-y : y); // first byte of the source row
+   int signed_stride = stbi__flip_vertically_on_write ? -stride_bytes : stride_bytes; // z[i-signed_stride] is the same column in the previously output row
+   if (type==0) { // no filtering: raw copy of the row
+      memcpy(line_buffer, z, width*n);
+      return;
+   }
+   // first loop isn't optimized since it's just one pixel
+   for (i = 0; i < n; ++i) { // first pixel of the row: there is no pixel to the left, so left terms are dropped
+      switch (type) {
+         case 1: line_buffer[i] = z[i]; break;
+         case 2: line_buffer[i] = z[i] - z[i-signed_stride]; break;
+         case 3: line_buffer[i] = z[i] - (z[i-signed_stride]>>1); break;
+         case 4: line_buffer[i] = (signed char) (z[i] - stbiw__paeth(0,z[i-signed_stride],0)); break;
+         case 5: line_buffer[i] = z[i]; break;
+         case 6: line_buffer[i] = z[i]; break;
+      }
+   }
+   switch (type) { // remaining bytes of the row; z[i-n] is the same channel of the pixel to the left
+      case 1: for (i=n; i < width*n; ++i) line_buffer[i] = z[i] - z[i-n]; break; // difference from the left pixel
+      case 2: for (i=n; i < width*n; ++i) line_buffer[i] = z[i] - z[i-signed_stride]; break; // difference from the previous row
+      case 3: for (i=n; i < width*n; ++i) line_buffer[i] = z[i] - ((z[i-n] + z[i-signed_stride])>>1); break; // difference from the floor-average of left and previous row
+      case 4: for (i=n; i < width*n; ++i) line_buffer[i] = z[i] - stbiw__paeth(z[i-n], z[i-signed_stride], z[i-signed_stride-n]); break; // difference from stbiw__paeth(left, previous row, previous row left)
+      case 5: for (i=n; i < width*n; ++i) line_buffer[i] = z[i] - (z[i-n]>>1); break; // case 3 with the previous-row term taken as 0
+      case 6: for (i=n; i < width*n; ++i) line_buffer[i] = z[i] - stbiw__paeth(z[i-n], 0,0); break; // case 4 with both previous-row terms taken as 0
+   }
+}
+STBIWDEF unsigned char *stbi_write_png_to_mem(const unsigned char *pixels, int stride_bytes, int x, int y, int n, int *out_len)
+{
+   // Encodes an x-by-y image with n interleaved 8-bit channels per pixel as a
+   // complete PNG held in one STBIW_MALLOC'd buffer.
+   //   pixels        first byte of the image
+   //   stride_bytes  byte distance between rows; 0 means tightly packed (x*n)
+   //   x, y          width and height in pixels, both must be positive
+   //   n             channels per pixel, 1..4 (selects the IHDR colour type)
+   //   out_len       receives the size of the returned buffer
+   // Returns the buffer (release it with STBIW_FREE), or 0 on invalid
+   // arguments, size overflow or allocation failure.
+   int force_filter = stbi_write_force_png_filter;
+   int ctype[5] = { -1, 0, 4, 2, 6 };
+   unsigned char sig[8] = { 137,80,78,71,13,10,26,10 };
+   unsigned char *out,*o, *filt, *zlib;
+   signed char *line_buffer;
+   int j,zlen,row_bytes;
+   // ctype[] only has entries for 1..4 channels and a PNG cannot have a zero
+   // dimension, so reject bad arguments instead of indexing out of bounds.
+   if (!pixels || !out_len || x <= 0 || y <= 0 || n < 1 || n > 4)
+      return 0;
+   // The filtered image needs (x*n+1)*y bytes computed in int; refuse sizes
+   // that would overflow rather than under-allocate.
+   if (x > (0x7fffffff - 1) / n)
+      return 0;
+   row_bytes = x*n + 1; // one filter-type byte followed by the filtered row
+   if (row_bytes > 0x7fffffff / y)
+      return 0;
+   if (stride_bytes == 0)
+      stride_bytes = x * n;
+   if (force_filter >= 5) {
+      force_filter = -1;
+   }
+   filt = (unsigned char *) STBIW_MALLOC(row_bytes * y); if (!filt) return 0;
+   line_buffer = (signed char *) STBIW_MALLOC(x * n); if (!line_buffer) { STBIW_FREE(filt); return 0; }
+   for (j=0; j < y; ++j) {
+      int filter_type;
+      if (force_filter > -1) {
+         filter_type = force_filter;
+         stbiw__encode_png_line((unsigned char*)(pixels), stride_bytes, x, y, j, n, force_filter, line_buffer);
+      } else { // Estimate the best filter by running through all of them:
+         int best_filter = 0, best_filter_val = 0x7fffffff, est, i;
+         for (filter_type = 0; filter_type < 5; filter_type++) {
+            stbiw__encode_png_line((unsigned char*)(pixels), stride_bytes, x, y, j, n, filter_type, line_buffer);
+            // Estimate the entropy of the line using this filter; the less, the better.
+            est = 0;
+            for (i = 0; i < x*n; ++i) {
+               est += abs((signed char) line_buffer[i]);
+            }
+            if (est < best_filter_val) {
+               best_filter_val = est;
+               best_filter = filter_type;
+            }
+         }
+         if (filter_type != best_filter) {  // If the last iteration already got us the best filter, don't redo it
+            stbiw__encode_png_line((unsigned char*)(pixels), stride_bytes, x, y, j, n, best_filter, line_buffer);
+            filter_type = best_filter;
+         }
+      }
+      // when we get here, filter_type contains the filter type, and line_buffer contains the data
+      filt[j*row_bytes] = (unsigned char) filter_type;
+      STBIW_MEMMOVE(filt+j*row_bytes+1, line_buffer, x*n);
+   }
+   STBIW_FREE(line_buffer);
+   zlib = stbi_zlib_compress(filt, y*row_bytes, &zlen, stbi_write_png_compression_level);
+   STBIW_FREE(filt);
+   if (!zlib) return 0;
+   // each tag requires 12 bytes of overhead
+   out = (unsigned char *) STBIW_MALLOC(8 + 12+13 + 12+zlen + 12);
+   if (!out) { STBIW_FREE(zlib); return 0; } // do not leak the compressed stream
+   *out_len = 8 + 12+13 + 12+zlen + 12;
+   o=out;
+   STBIW_MEMMOVE(o,sig,8); o+= 8;
+   stbiw__wp32(o, 13); // header length
+   stbiw__wptag(o, "IHDR");
+   stbiw__wp32(o, x);
+   stbiw__wp32(o, y);
+   *o++ = 8; // bits per channel
+   *o++ = STBIW_UCHAR(ctype[n]);
+   *o++ = 0;
+   *o++ = 0;
+   *o++ = 0;
+   stbiw__wpcrc(&o,13);
+   stbiw__wp32(o, zlen);
+   stbiw__wptag(o, "IDAT");
+   STBIW_MEMMOVE(o, zlib, zlen);
+   o += zlen;
+   STBIW_FREE(zlib);
+   stbiw__wpcrc(&o, zlen);
+   stbiw__wp32(o,0);
+   stbiw__wptag(o, "IEND");
+   stbiw__wpcrc(&o,0);
+   STBIW_ASSERT(o == out + *out_len);
+   return out;
+}
+#ifndef STBI_WRITE_NO_STDIO
+STBIWDEF int stbi_write_png(char const *filename, int x, int y, int comp, const void *data, int stride_bytes)
+{
+   // Encodes the image with stbi_write_png_to_mem and writes the result to
+   // filename. Returns 1 only when encoding succeeded, the file could be
+   // opened, every byte was written and the file closed cleanly; 0 otherwise.
+   FILE *f;
+   int len, ok;
+   unsigned char *png = stbi_write_png_to_mem((const unsigned char *) data, stride_bytes, x, y, comp, &len);
+   if (png == NULL) return 0;
+   f = stbiw__fopen(filename, "wb");
+   if (!f) { STBIW_FREE(png); return 0; }
+   // A short write (e.g. disk full) must not be reported as success.
+   ok = fwrite(png, 1, len, f) == (size_t) len;
+   // Buffered data may only reach the file on close, so check that too.
+   if (fclose(f) != 0) ok = 0;
+   STBIW_FREE(png);
+   return ok;
+}
+#endif
+STBIWDEF int stbi_write_png_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data, int stride_bytes)
+{
+   // Encodes the image to an in-memory PNG and hands the whole buffer to
+   // func(context, buffer, size) in a single call. Returns 1 on success and
+   // 0 when encoding failed (func is then not called).
+   int png_size;
+   unsigned char *encoded = stbi_write_png_to_mem((const unsigned char *) data, stride_bytes, x, y, comp, &png_size);
+   if (encoded != NULL) {
+      func(context, encoded, png_size);
+      STBIW_FREE(encoded);
+      return 1;
+   }
+   return 0;
+}
+/* ***************************************************************************
+ *
+ * JPEG writer
+ *
+ * This is based on Jon Olick's jo_jpeg.cpp:
+ * public domain Simple, Minimalistic JPEG writer - http://www.jonolick.com/code.html
+ */
+static const unsigned char stbiw__jpg_ZigZag[] = { 0,1,5,6,14,15,27,28,2,4,7,13,16,26,29,42,3,8,12,17,25,30,41,43,9,11,18,
+      24,31,40,44,53,10,19,23,32,39,45,52,54,20,22,33,38,46,51,55,60,21,34,37,47,50,56,59,61,35,36,48,49,57,58,62,63 }; // entry j is the output position, in zigzag order, of the coefficient at row-major index j of an 8x8 block
+static void stbiw__jpg_writeBits(stbi__write_context *s, int *bitBufP, int *bitCntP, const unsigned short *bs) {
+   // Appends the bs[1]-bit code bs[0] to the pending bit buffer and flushes
+   // every complete byte to s. Each emitted 0xFF byte is followed by a 0 byte.
+   // The updated buffer and bit count are stored back through the pointers.
+   int acc = *bitBufP;
+   int pending = *bitCntP + bs[1];
+   acc |= bs[0] << (24 - pending);
+   for (; pending >= 8; pending -= 8, acc <<= 8) {
+      unsigned char byte = (acc >> 16) & 255; // oldest pending byte sits in bits 16..23
+      stbiw__putc(s, byte);
+      if (byte == 255)
+         stbiw__putc(s, 0);
+   }
+   *bitBufP = acc;
+   *bitCntP = pending;
+}
+static void stbiw__jpg_DCT(float *d0p, float *d1p, float *d2p, float *d3p, float *d4p, float *d5p, float *d6p, float *d7p) {  /* One 8-point transform pass (a DCT, per the function name) over the eight
+    * floats reached through d0p..d7p; every input is read first and all eight
+    * are then overwritten with the results. The pointers let the caller pass
+    * either eight consecutive values or eight values a fixed stride apart.
+    * NOTE(review): the outputs look unnormalised, with the scaling apparently
+    * folded into the fdtbl tables built from aasf[] by the caller -- confirm.
+    */
+   float d0 = *d0p, d1 = *d1p, d2 = *d2p, d3 = *d3p, d4 = *d4p, d5 = *d5p, d6 = *d6p, d7 = *d7p;
+   float z1, z2, z3, z4, z5, z11, z13;
+   float tmp0 = d0 + d7; // sums and differences of mirrored input pairs
+   float tmp7 = d0 - d7;
+   float tmp1 = d1 + d6;
+   float tmp6 = d1 - d6;
+   float tmp2 = d2 + d5;
+   float tmp5 = d2 - d5;
+   float tmp3 = d3 + d4;
+   float tmp4 = d3 - d4;
+   // Even part
+   float tmp10 = tmp0 + tmp3;   // phase 2
+   float tmp13 = tmp0 - tmp3;
+   float tmp11 = tmp1 + tmp2;
+   float tmp12 = tmp1 - tmp2;
+   d0 = tmp10 + tmp11;       // phase 3
+   d4 = tmp10 - tmp11;
+   z1 = (tmp12 + tmp13) * 0.707106781f; // c4
+   d2 = tmp13 + z1;       // phase 5
+   d6 = tmp13 - z1;
+   // Odd part
+   tmp10 = tmp4 + tmp5;       // phase 2
+   tmp11 = tmp5 + tmp6;
+   tmp12 = tmp6 + tmp7;
+   // The rotator is modified from fig 4-8 to avoid extra negations.
+   z5 = (tmp10 - tmp12) * 0.382683433f; // c6
+   z2 = tmp10 * 0.541196100f + z5; // c2-c6
+   z4 = tmp12 * 1.306562965f + z5; // c2+c6
+   z3 = tmp11 * 0.707106781f; // c4
+   z11 = tmp7 + z3;      // phase 5
+   z13 = tmp7 - z3;
+   *d5p = z13 + z2;         // phase 6
+   *d3p = z13 - z2;
+   *d1p = z11 + z4;
+   *d7p = z11 - z4;
+   *d0p = d0;  *d2p = d2;  *d4p = d4;  *d6p = d6; // even outputs are stored last
+}
+static void stbiw__jpg_calcBits(int val, unsigned short bits[2]) {
+   // Computes the variable-length encoding of val:
+   //   bits[1] = number of bits needed for |val| (at least 1)
+   //   bits[0] = low bits[1] bits of val, or of val-1 when val is negative
+   int magnitude = val;
+   int coded = val;
+   unsigned short nbits = 1;
+   if (val < 0) {
+      magnitude = -val;
+      coded = val - 1;
+   }
+   for (magnitude >>= 1; magnitude != 0; magnitude >>= 1) {
+      ++nbits;
+   }
+   bits[1] = nbits;
+   bits[0] = coded & ((1 << nbits) - 1);
+}
+static int stbiw__jpg_processDU(stbi__write_context *s, int *bitBuf, int *bitCnt, float *CDU, int du_stride, float *fdtbl, int DC, const unsigned short HTDC[256][2], const unsigned short HTAC[256][2]) {  /* Transforms, quantizes and entropy-codes one 8x8 block.
+    *
+    *   s, bitBuf, bitCnt  output context and pending-bit state, passed on to
+    *                      stbiw__jpg_writeBits
+    *   CDU, du_stride     the block's samples, 8 rows that are du_stride floats
+    *                      apart; transformed in place
+    *   fdtbl              64 per-coefficient multipliers in row-major order
+    *   DC                 the value the DC coefficient is coded relative to
+    *   HTDC, HTAC         {code, bit length} pairs for DC and AC symbols
+    *
+    * Returns this block's quantized DC coefficient (DU[0]).
+    */
+   const unsigned short EOB[2] = { HTAC[0x00][0], HTAC[0x00][1] }; // code written when no further nonzero coefficients follow
+   const unsigned short M16zeroes[2] = { HTAC[0xF0][0], HTAC[0xF0][1] }; // code written once per complete run of 16 zeros
+   int dataOff, i, j, n, diff, end0pos, x, y;
+   int DU[64];
+   // DCT rows
+   for(dataOff=0, n=du_stride*8; dataOff<n; dataOff+=du_stride) {
+      stbiw__jpg_DCT(&CDU[dataOff], &CDU[dataOff+1], &CDU[dataOff+2], &CDU[dataOff+3], &CDU[dataOff+4], &CDU[dataOff+5], &CDU[dataOff+6], &CDU[dataOff+7]);
+   }
+   // DCT columns
+   for(dataOff=0; dataOff<8; ++dataOff) {
+      stbiw__jpg_DCT(&CDU[dataOff], &CDU[dataOff+du_stride], &CDU[dataOff+du_stride*2], &CDU[dataOff+du_stride*3], &CDU[dataOff+du_stride*4],
+                     &CDU[dataOff+du_stride*5], &CDU[dataOff+du_stride*6], &CDU[dataOff+du_stride*7]);
+   }
+   // Quantize/descale/zigzag the coefficients
+   for(y = 0, j=0; y < 8; ++y) {
+      for(x = 0; x < 8; ++x,++j) {
+         float v;
+         i = y*du_stride+x;
+         v = CDU[i]*fdtbl[j];
+         // DU[stbiw__jpg_ZigZag[j]] = (int)(v < 0 ? ceilf(v - 0.5f) : floorf(v + 0.5f));
+         // ceilf() and floorf() are C99, not C89, but I /think/ they're not needed here anyway?
+         DU[stbiw__jpg_ZigZag[j]] = (int)(v < 0 ? v - 0.5f : v + 0.5f); // the int cast truncates toward zero, so this rounds to nearest
+      }
+   }
+   // Encode DC
+   diff = DU[0] - DC;
+   if (diff == 0) {
+      stbiw__jpg_writeBits(s, bitBuf, bitCnt, HTDC[0]);
+   } else {
+      unsigned short bits[2];
+      stbiw__jpg_calcBits(diff, bits);
+      stbiw__jpg_writeBits(s, bitBuf, bitCnt, HTDC[bits[1]]); // symbol is the bit length of diff
+      stbiw__jpg_writeBits(s, bitBuf, bitCnt, bits);          // followed by the bits of diff itself
+   }
+   // Encode ACs
+   end0pos = 63;
+   for(; (end0pos>0)&&(DU[end0pos]==0); --end0pos) { // scan backwards for the last nonzero coefficient
+   }
+   // end0pos = first element in reverse order !=0
+   if(end0pos == 0) { // every AC coefficient is zero
+      stbiw__jpg_writeBits(s, bitBuf, bitCnt, EOB);
+      return DU[0];
+   }
+   for(i = 1; i <= end0pos; ++i) {
+      int startpos = i;
+      int nrzeroes;
+      unsigned short bits[2];
+      for (; DU[i]==0 && i<=end0pos; ++i) { // skip the zero run; DU[end0pos] is nonzero, so i stops at or before end0pos
+      }
+      nrzeroes = i-startpos;
+      if ( nrzeroes >= 16 ) {
+         int lng = nrzeroes>>4;
+         int nrmarker;
+         for (nrmarker=1; nrmarker <= lng; ++nrmarker)
+            stbiw__jpg_writeBits(s, bitBuf, bitCnt, M16zeroes);
+         nrzeroes &= 15; // only the remainder of the run goes into the symbol below
+      }
+      stbiw__jpg_calcBits(DU[i], bits);
+      stbiw__jpg_writeBits(s, bitBuf, bitCnt, HTAC[(nrzeroes<<4)+bits[1]]); // symbol: zero-run length in the high nibble, bit length in the low nibble
+      stbiw__jpg_writeBits(s, bitBuf, bitCnt, bits);
+   }
+   if(end0pos != 63) { // EOB is omitted when the very last coefficient is nonzero
+      stbiw__jpg_writeBits(s, bitBuf, bitCnt, EOB);
+   }
+   return DU[0];
+}
+static int stbi_write_jpg_core(stbi__write_context *s, int width, int height, int comp, const void* data, int quality) {  /* Writes a complete JPEG stream for the image to s.
+    *
+    *   s              output context; bytes go through s->func and stbiw__putc
+    *   width, height  image size in pixels
+    *   comp           bytes per pixel, 1..4; only the first three channels are
+    *                  read when comp > 2, otherwise only the first one
+    *   data           pixels, rows addressed as row*width*comp (no row stride)
+    *   quality        0 selects 90; other values are clamped to 1..100
+    *
+    * Returns 0 without writing anything when data is NULL, width or height is
+    * zero, or comp is outside 1..4; returns 1 otherwise.
+    */
+   // Constants that don't pollute global namespace
+   static const unsigned char std_dc_luminance_nrcodes[] = {0,0,1,5,1,1,1,1,1,1,0,0,0,0,0,0,0};
+   static const unsigned char std_dc_luminance_values[] = {0,1,2,3,4,5,6,7,8,9,10,11};
+   static const unsigned char std_ac_luminance_nrcodes[] = {0,0,2,1,3,3,2,4,3,5,5,4,4,0,0,1,0x7d};
+   static const unsigned char std_ac_luminance_values[] = {
+      0x01,0x02,0x03,0x00,0x04,0x11,0x05,0x12,0x21,0x31,0x41,0x06,0x13,0x51,0x61,0x07,0x22,0x71,0x14,0x32,0x81,0x91,0xa1,0x08,
+      0x23,0x42,0xb1,0xc1,0x15,0x52,0xd1,0xf0,0x24,0x33,0x62,0x72,0x82,0x09,0x0a,0x16,0x17,0x18,0x19,0x1a,0x25,0x26,0x27,0x28,
+      0x29,0x2a,0x34,0x35,0x36,0x37,0x38,0x39,0x3a,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4a,0x53,0x54,0x55,0x56,0x57,0x58,0x59,
+      0x5a,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6a,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7a,0x83,0x84,0x85,0x86,0x87,0x88,0x89,
+      0x8a,0x92,0x93,0x94,0x95,0x96,0x97,0x98,0x99,0x9a,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7,0xa8,0xa9,0xaa,0xb2,0xb3,0xb4,0xb5,0xb6,
+      0xb7,0xb8,0xb9,0xba,0xc2,0xc3,0xc4,0xc5,0xc6,0xc7,0xc8,0xc9,0xca,0xd2,0xd3,0xd4,0xd5,0xd6,0xd7,0xd8,0xd9,0xda,0xe1,0xe2,
+      0xe3,0xe4,0xe5,0xe6,0xe7,0xe8,0xe9,0xea,0xf1,0xf2,0xf3,0xf4,0xf5,0xf6,0xf7,0xf8,0xf9,0xfa
+   };
+   static const unsigned char std_dc_chrominance_nrcodes[] = {0,0,3,1,1,1,1,1,1,1,1,1,0,0,0,0,0};
+   static const unsigned char std_dc_chrominance_values[] = {0,1,2,3,4,5,6,7,8,9,10,11};
+   static const unsigned char std_ac_chrominance_nrcodes[] = {0,0,2,1,2,4,4,3,4,7,5,4,4,0,1,2,0x77};
+   static const unsigned char std_ac_chrominance_values[] = {
+      0x00,0x01,0x02,0x03,0x11,0x04,0x05,0x21,0x31,0x06,0x12,0x41,0x51,0x07,0x61,0x71,0x13,0x22,0x32,0x81,0x08,0x14,0x42,0x91,
+      0xa1,0xb1,0xc1,0x09,0x23,0x33,0x52,0xf0,0x15,0x62,0x72,0xd1,0x0a,0x16,0x24,0x34,0xe1,0x25,0xf1,0x17,0x18,0x19,0x1a,0x26,
+      0x27,0x28,0x29,0x2a,0x35,0x36,0x37,0x38,0x39,0x3a,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4a,0x53,0x54,0x55,0x56,0x57,0x58,
+      0x59,0x5a,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6a,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7a,0x82,0x83,0x84,0x85,0x86,0x87,
+      0x88,0x89,0x8a,0x92,0x93,0x94,0x95,0x96,0x97,0x98,0x99,0x9a,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7,0xa8,0xa9,0xaa,0xb2,0xb3,0xb4,
+      0xb5,0xb6,0xb7,0xb8,0xb9,0xba,0xc2,0xc3,0xc4,0xc5,0xc6,0xc7,0xc8,0xc9,0xca,0xd2,0xd3,0xd4,0xd5,0xd6,0xd7,0xd8,0xd9,0xda,
+      0xe2,0xe3,0xe4,0xe5,0xe6,0xe7,0xe8,0xe9,0xea,0xf2,0xf3,0xf4,0xf5,0xf6,0xf7,0xf8,0xf9,0xfa
+   };
+   // Huffman tables
+   static const unsigned short YDC_HT[256][2] = { {0,2},{2,3},{3,3},{4,3},{5,3},{6,3},{14,4},{30,5},{62,6},{126,7},{254,8},{510,9}};
+   static const unsigned short UVDC_HT[256][2] = { {0,2},{1,2},{2,2},{6,3},{14,4},{30,5},{62,6},{126,7},{254,8},{510,9},{1022,10},{2046,11}};
+   static const unsigned short YAC_HT[256][2] = {
+      {10,4},{0,2},{1,2},{4,3},{11,4},{26,5},{120,7},{248,8},{1014,10},{65410,16},{65411,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {12,4},{27,5},{121,7},{502,9},{2038,11},{65412,16},{65413,16},{65414,16},{65415,16},{65416,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {28,5},{249,8},{1015,10},{4084,12},{65417,16},{65418,16},{65419,16},{65420,16},{65421,16},{65422,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {58,6},{503,9},{4085,12},{65423,16},{65424,16},{65425,16},{65426,16},{65427,16},{65428,16},{65429,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {59,6},{1016,10},{65430,16},{65431,16},{65432,16},{65433,16},{65434,16},{65435,16},{65436,16},{65437,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {122,7},{2039,11},{65438,16},{65439,16},{65440,16},{65441,16},{65442,16},{65443,16},{65444,16},{65445,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {123,7},{4086,12},{65446,16},{65447,16},{65448,16},{65449,16},{65450,16},{65451,16},{65452,16},{65453,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {250,8},{4087,12},{65454,16},{65455,16},{65456,16},{65457,16},{65458,16},{65459,16},{65460,16},{65461,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {504,9},{32704,15},{65462,16},{65463,16},{65464,16},{65465,16},{65466,16},{65467,16},{65468,16},{65469,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {505,9},{65470,16},{65471,16},{65472,16},{65473,16},{65474,16},{65475,16},{65476,16},{65477,16},{65478,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {506,9},{65479,16},{65480,16},{65481,16},{65482,16},{65483,16},{65484,16},{65485,16},{65486,16},{65487,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {1017,10},{65488,16},{65489,16},{65490,16},{65491,16},{65492,16},{65493,16},{65494,16},{65495,16},{65496,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {1018,10},{65497,16},{65498,16},{65499,16},{65500,16},{65501,16},{65502,16},{65503,16},{65504,16},{65505,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {2040,11},{65506,16},{65507,16},{65508,16},{65509,16},{65510,16},{65511,16},{65512,16},{65513,16},{65514,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {65515,16},{65516,16},{65517,16},{65518,16},{65519,16},{65520,16},{65521,16},{65522,16},{65523,16},{65524,16},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {2041,11},{65525,16},{65526,16},{65527,16},{65528,16},{65529,16},{65530,16},{65531,16},{65532,16},{65533,16},{65534,16},{0,0},{0,0},{0,0},{0,0},{0,0}
+   };
+   static const unsigned short UVAC_HT[256][2] = {
+      {0,2},{1,2},{4,3},{10,4},{24,5},{25,5},{56,6},{120,7},{500,9},{1014,10},{4084,12},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {11,4},{57,6},{246,8},{501,9},{2038,11},{4085,12},{65416,16},{65417,16},{65418,16},{65419,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {26,5},{247,8},{1015,10},{4086,12},{32706,15},{65420,16},{65421,16},{65422,16},{65423,16},{65424,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {27,5},{248,8},{1016,10},{4087,12},{65425,16},{65426,16},{65427,16},{65428,16},{65429,16},{65430,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {58,6},{502,9},{65431,16},{65432,16},{65433,16},{65434,16},{65435,16},{65436,16},{65437,16},{65438,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {59,6},{1017,10},{65439,16},{65440,16},{65441,16},{65442,16},{65443,16},{65444,16},{65445,16},{65446,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {121,7},{2039,11},{65447,16},{65448,16},{65449,16},{65450,16},{65451,16},{65452,16},{65453,16},{65454,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {122,7},{2040,11},{65455,16},{65456,16},{65457,16},{65458,16},{65459,16},{65460,16},{65461,16},{65462,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {249,8},{65463,16},{65464,16},{65465,16},{65466,16},{65467,16},{65468,16},{65469,16},{65470,16},{65471,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {503,9},{65472,16},{65473,16},{65474,16},{65475,16},{65476,16},{65477,16},{65478,16},{65479,16},{65480,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {504,9},{65481,16},{65482,16},{65483,16},{65484,16},{65485,16},{65486,16},{65487,16},{65488,16},{65489,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {505,9},{65490,16},{65491,16},{65492,16},{65493,16},{65494,16},{65495,16},{65496,16},{65497,16},{65498,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {506,9},{65499,16},{65500,16},{65501,16},{65502,16},{65503,16},{65504,16},{65505,16},{65506,16},{65507,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {2041,11},{65508,16},{65509,16},{65510,16},{65511,16},{65512,16},{65513,16},{65514,16},{65515,16},{65516,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {16352,14},{65517,16},{65518,16},{65519,16},{65520,16},{65521,16},{65522,16},{65523,16},{65524,16},{65525,16},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {1018,10},{32707,15},{65526,16},{65527,16},{65528,16},{65529,16},{65530,16},{65531,16},{65532,16},{65533,16},{65534,16},{0,0},{0,0},{0,0},{0,0},{0,0}
+   };
+   static const int YQT[] = {16,11,10,16,24,40,51,61,12,12,14,19,26,58,60,55,14,13,16,24,40,57,69,56,14,17,22,29,51,87,80,62,18,22,
+                             37,56,68,109,103,77,24,35,55,64,81,104,113,92,49,64,78,87,103,121,120,101,72,92,95,98,112,100,103,99};
+   static const int UVQT[] = {17,18,24,47,99,99,99,99,18,21,26,66,99,99,99,99,24,26,56,99,99,99,99,99,47,66,99,99,99,99,99,99,
+                              99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99};
+   static const float aasf[] = { 1.0f * 2.828427125f, 1.387039845f * 2.828427125f, 1.306562965f * 2.828427125f, 1.175875602f * 2.828427125f,
+                                 1.0f * 2.828427125f, 0.785694958f * 2.828427125f, 0.541196100f * 2.828427125f, 0.275899379f * 2.828427125f };
+   int row, col, i, k, subsample;
+   float fdtbl_Y[64], fdtbl_UV[64];
+   unsigned char YTable[64], UVTable[64];
+   if(!data || !width || !height || comp > 4 || comp < 1) {
+      return 0;
+   }
+   quality = quality ? quality : 90;
+   subsample = quality <= 90 ? 1 : 0; // decided before clamping: chroma is subsampled unless quality exceeds 90
+   quality = quality < 1 ? 1 : quality > 100 ? 100 : quality;
+   quality = quality < 50 ? 5000 / quality : 200 - quality * 2; // from here on quality is a percentage scale for the base tables (100 at quality 50, 0 at quality 100)
+   for(i = 0; i < 64; ++i) { // scale both base tables, clamp each entry to 1..255, store at its zigzag position
+      int uvti, yti = (YQT[i]*quality+50)/100;
+      YTable[stbiw__jpg_ZigZag[i]] = (unsigned char) (yti < 1 ? 1 : yti > 255 ? 255 : yti);
+      uvti = (UVQT[i]*quality+50)/100;
+      UVTable[stbiw__jpg_ZigZag[i]] = (unsigned char) (uvti < 1 ? 1 : uvti > 255 ? 255 : uvti);
+   }
+   for(row = 0, k = 0; row < 8; ++row) { // per-coefficient multipliers: reciprocal of the table entry times the aasf row and column factors
+      for(col = 0; col < 8; ++col, ++k) {
+         fdtbl_Y[k]  = 1 / (YTable [stbiw__jpg_ZigZag[k]] * aasf[row] * aasf[col]);
+         fdtbl_UV[k] = 1 / (UVTable[stbiw__jpg_ZigZag[k]] * aasf[row] * aasf[col]);
+      }
+   }
+   // Write Headers
+   {
+      static const unsigned char head0[] = { 0xFF,0xD8,0xFF,0xE0,0,0x10,'J','F','I','F',0,1,1,0,0,1,0,1,0,0,0xFF,0xDB,0,0x84,0 };
+      static const unsigned char head2[] = { 0xFF,0xDA,0,0xC,3,1,0,2,0x11,3,0x11,0,0x3F,0 };
+      const unsigned char head1[] = { 0xFF,0xC0,0,0x11,8,(unsigned char)(height>>8),STBIW_UCHAR(height),(unsigned char)(width>>8),STBIW_UCHAR(width),
+                                      3,1,(unsigned char)(subsample?0x22:0x11),0,2,0x11,1,3,0x11,1,0xFF,0xC4,0x01,0xA2,0 }; // height and width are each stored as two bytes, high byte first
+      s->func(s->context, (void*)head0, sizeof(head0));
+      s->func(s->context, (void*)YTable, sizeof(YTable));
+      stbiw__putc(s, 1);
+      s->func(s->context, UVTable, sizeof(UVTable));
+      s->func(s->context, (void*)head1, sizeof(head1));
+      s->func(s->context, (void*)(std_dc_luminance_nrcodes+1), sizeof(std_dc_luminance_nrcodes)-1); // the +1 / -1 skips element 0 of each nrcodes array
+      s->func(s->context, (void*)std_dc_luminance_values, sizeof(std_dc_luminance_values));
+      stbiw__putc(s, 0x10); // HTYACinfo
+      s->func(s->context, (void*)(std_ac_luminance_nrcodes+1), sizeof(std_ac_luminance_nrcodes)-1);
+      s->func(s->context, (void*)std_ac_luminance_values, sizeof(std_ac_luminance_values));
+      stbiw__putc(s, 1); // HTUDCinfo
+      s->func(s->context, (void*)(std_dc_chrominance_nrcodes+1), sizeof(std_dc_chrominance_nrcodes)-1);
+      s->func(s->context, (void*)std_dc_chrominance_values, sizeof(std_dc_chrominance_values));
+      stbiw__putc(s, 0x11); // HTUACinfo
+      s->func(s->context, (void*)(std_ac_chrominance_nrcodes+1), sizeof(std_ac_chrominance_nrcodes)-1);
+      s->func(s->context, (void*)std_ac_chrominance_values, sizeof(std_ac_chrominance_values));
+      s->func(s->context, (void*)head2, sizeof(head2));
+   }
+   // Encode 8x8 macroblocks
+   {
+      static const unsigned short fillBits[] = {0x7F, 7}; // seven 1-bits, used to flush the last partial byte
+      int DCY=0, DCU=0, DCV=0; // previous DC value of each channel, updated from every processDU call
+      int bitBuf=0, bitCnt=0;
+      // comp == 2 is grey+alpha (alpha is ignored)
+      int ofsG = comp > 2 ? 1 : 0, ofsB = comp > 2 ? 2 : 0; // for comp <= 2 all three pointers alias the single grey channel
+      const unsigned char *dataR = (const unsigned char *)data;
+      const unsigned char *dataG = dataR + ofsG;
+      const unsigned char *dataB = dataR + ofsB;
+      int x, y, pos;
+      if(subsample) { // 16x16 tiles: four 8x8 Y blocks plus one averaged 8x8 block each for U and V
+         for(y = 0; y < height; y += 16) {
+            for(x = 0; x < width; x += 16) {
+               float Y[256], U[256], V[256];
+               for(row = y, pos = 0; row < y+16; ++row) {
+                  // row >= height => use last input row
+                  int clamped_row = (row < height) ? row : height - 1;
+                  int base_p = (stbi__flip_vertically_on_write ? (height-1-clamped_row) : clamped_row)*width*comp;
+                  for(col = x; col < x+16; ++col, ++pos) {
+                     // if col >= width => use pixel from last input column
+                     int p = base_p + ((col < width) ? col : (width-1))*comp;
+                     float r = dataR[p], g = dataG[p], b = dataB[p];
+                     Y[pos]= +0.29900f*r + 0.58700f*g + 0.11400f*b - 128;
+                     U[pos]= -0.16874f*r - 0.33126f*g + 0.50000f*b;
+                     V[pos]= +0.50000f*r - 0.41869f*g - 0.08131f*b;
+                  }
+               }
+               DCY = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, Y+0,   16, fdtbl_Y, DCY, YDC_HT, YAC_HT); // top-left 8x8 of the tile
+               DCY = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, Y+8,   16, fdtbl_Y, DCY, YDC_HT, YAC_HT); // top-right
+               DCY = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, Y+128, 16, fdtbl_Y, DCY, YDC_HT, YAC_HT); // bottom-left
+               DCY = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, Y+136, 16, fdtbl_Y, DCY, YDC_HT, YAC_HT); // bottom-right
+               // subsample U,V
+               {
+                  float subU[64], subV[64];
+                  int yy, xx;
+                  for(yy = 0, pos = 0; yy < 8; ++yy) {
+                     for(xx = 0; xx < 8; ++xx, ++pos) {
+                        int j = yy*32+xx*2; // top-left sample of the 2x2 group inside the 16-wide tile
+                        subU[pos] = (U[j+0] + U[j+1] + U[j+16] + U[j+17]) * 0.25f;
+                        subV[pos] = (V[j+0] + V[j+1] + V[j+16] + V[j+17]) * 0.25f;
+                     }
+                  }
+                  DCU = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, subU, 8, fdtbl_UV, DCU, UVDC_HT, UVAC_HT);
+                  DCV = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, subV, 8, fdtbl_UV, DCV, UVDC_HT, UVAC_HT);
+               }
+            }
+         }
+      } else { // 8x8 tiles with one full-resolution block per channel
+         for(y = 0; y < height; y += 8) {
+            for(x = 0; x < width; x += 8) {
+               float Y[64], U[64], V[64];
+               for(row = y, pos = 0; row < y+8; ++row) {
+                  // row >= height => use last input row
+                  int clamped_row = (row < height) ? row : height - 1;
+                  int base_p = (stbi__flip_vertically_on_write ? (height-1-clamped_row) : clamped_row)*width*comp;
+                  for(col = x; col < x+8; ++col, ++pos) {
+                     // if col >= width => use pixel from last input column
+                     int p = base_p + ((col < width) ? col : (width-1))*comp;
+                     float r = dataR[p], g = dataG[p], b = dataB[p];
+                     Y[pos]= +0.29900f*r + 0.58700f*g + 0.11400f*b - 128;
+                     U[pos]= -0.16874f*r - 0.33126f*g + 0.50000f*b;
+                     V[pos]= +0.50000f*r - 0.41869f*g - 0.08131f*b;
+                  }
+               }
+               DCY = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, Y, 8, fdtbl_Y,  DCY, YDC_HT, YAC_HT);
+               DCU = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, U, 8, fdtbl_UV, DCU, UVDC_HT, UVAC_HT);
+               DCV = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, V, 8, fdtbl_UV, DCV, UVDC_HT, UVAC_HT);
+            }
+         }
+      }
+      // Do the bit alignment of the EOI marker
+      stbiw__jpg_writeBits(s, &bitBuf, &bitCnt, fillBits);
+   }
+   // EOI
+   stbiw__putc(s, 0xFF);
+   stbiw__putc(s, 0xD9);
+   return 1;
+}
+STBIWDEF int stbi_write_jpg_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data, int quality)
+{
+   // Encodes the image as JPEG, delivering the output through
+   // func(context, ...). Returns the result of stbi_write_jpg_core.
+   stbi__write_context ctx = { 0 };
+   stbi__start_write_callbacks(&ctx, func, context);
+   return stbi_write_jpg_core(&ctx, x, y, comp, data, quality);
+}
+#ifndef STBI_WRITE_NO_STDIO
+STBIWDEF int stbi_write_jpg(char const *filename, int x, int y, int comp, const void *data, int quality)
+{
+   // Encodes the image as JPEG into filename. Returns 0 when the file cannot
+   // be opened for writing, otherwise the result of stbi_write_jpg_core.
+   stbi__write_context ctx = { 0 };
+   int result;
+   if (!stbi__start_write_file(&ctx, filename))
+      return 0;
+   result = stbi_write_jpg_core(&ctx, x, y, comp, data, quality);
+   stbi__end_write_file(&ctx);
+   return result;
+}
+#endif
+#endif // STB_IMAGE_WRITE_IMPLEMENTATION
+/* Revision history
+      1.16  (2021-07-11)
+             make Deflate code emit uncompressed blocks when it would otherwise expand
+             support writing BMPs with alpha channel
+      1.15  (2020-07-13) unknown
+      1.14  (2020-02-02) updated JPEG writer to downsample chroma channels
+      1.13
+      1.12
+      1.11  (2019-08-11)
+      1.10  (2019-02-07)
+             support utf8 filenames in Windows; fix warnings and platform ifdefs
+      1.09  (2018-02-11)
+             fix typo in zlib quality API, improve STB_I_W_STATIC in C++
+      1.08  (2018-01-29)
+             add stbi__flip_vertically_on_write, external zlib, zlib quality, choose PNG filter
+      1.07  (2017-07-24)
+             doc fix
+      1.06 (2017-07-23)
+             writing JPEG (using Jon Olick's code)
+      1.05   ???
+      1.04 (2017-03-03)
+             monochrome BMP expansion
+      1.03   ???
+      1.02 (2016-04-02)
+             avoid allocating large structures on the stack
+      1.01 (2016-01-16)
+             STBIW_REALLOC_SIZED: support allocators with no realloc support
+             avoid race-condition in crc initialization
+             minor compile issues
+      1.00 (2015-09-14)
+             installable file IO function
+      0.99 (2015-09-13)
+             warning fixes; TGA rle support
+      0.98 (2015-04-08)
+             added STBIW_MALLOC, STBIW_ASSERT etc
+      0.97 (2015-01-18)
+             fixed HDR asserts, rewrote HDR rle logic
+      0.96 (2015-01-17)
+             add HDR output
+             fix monochrome BMP
+      0.95 (2014-08-17)
+             add monochrome TGA output
+      0.94 (2014-05-31)
+             rename private functions to avoid conflicts with stb_image.h
+      0.93 (2014-05-27)
+             warning fixes
+      0.92 (2010-08-01)
+             casts to unsigned char to fix warnings
+      0.91 (2010-07-17)
+             first public release
+      0.90   first internal release
+*/
+/*
+------------------------------------------------------------------------------
+This software is available under 2 licenses -- choose whichever you prefer.
+------------------------------------------------------------------------------
+ALTERNATIVE A - MIT License
+Copyright (c) 2017 Sean Barrett
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+------------------------------------------------------------------------------
+ALTERNATIVE B - Public Domain (www.unlicense.org)
+This is free and unencumbered software released into the public domain.
+Anyone is free to copy, modify, publish, use, compile, sell, or distribute this
+software, either in source code form or as a compiled binary, for any purpose,
+commercial or non-commercial, and by any means.
+In jurisdictions that recognize copyright laws, the author or authors of this
+software dedicate any and all copyright interest in the software to the public
+domain. We make this dedication for the benefit of the public at large and to
+the detriment of our heirs and successors. We intend this dedication to be an
+overt act of relinquishment in perpetuity of all present and future rights to
+this software under copyright law.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+------------------------------------------------------------------------------
+*/

gaussiancity/extensions/grid_encoder/__init__.py ADDED Viewed

	@@ -0,0 +1,193 @@

+# -*- coding: utf-8 -*-
+#
+# @File:   __init__.py
+# @Author: Jiaxiang Tang (@ashawkey)
+# @Date:   2023-04-15 10:39:28
+# @Last Modified by: Haozhe Xie
+# @Last Modified at: 2023-04-15 13:08:46
+# @Email:  ashawkey1999@gmail.com
+# @Ref: https://github.com/ashawkey/torch-ngp
+import math
+import numpy as np
+import torch
+import grid_encoder_ext
+class GridEncoderFunction(torch.autograd.Function):
+    @staticmethod
+    def forward(
+        ctx,
+        inputs,
+        embeddings,
+        offsets,
+        per_level_scale,
+        base_resolution,
+        calc_grad_inputs=False,
+        gridtype=0,
+        align_corners=False,
+    ):
+        # inputs: [B, D], float in [0, 1]
+        # embeddings: [sO, C], float
+        # offsets: [L + 1], int
+        # RETURN: [B, F], float
+        inputs = inputs.contiguous()
+        # batch size, coord dim
+        B, D = inputs.shape
+        # level
+        L = offsets.shape[0] - 1
+        # embedding dim for each level
+        C = embeddings.shape[1]
+        # resolution multiplier at each level, apply log2 for later CUDA exp2f
+        S = math.log2(per_level_scale)
+        # base resolution
+        H = base_resolution
+        # L first, optimize cache for cuda kernel, but needs an extra permute later
+        outputs = torch.empty(L, B, C, device=inputs.device, dtype=embeddings.dtype)
+        if calc_grad_inputs:
+            dy_dx = torch.empty(
+                B, L * D * C, device=inputs.device, dtype=embeddings.dtype
+            )
+        else:
+            dy_dx = torch.empty(
+                1, device=inputs.device, dtype=embeddings.dtype
+            )  # placeholder... TODO: a better way?
+        grid_encoder_ext.forward(
+            inputs,
+            embeddings,
+            offsets,
+            outputs,
+            B,
+            D,
+            C,
+            L,
+            S,
+            H,
+            calc_grad_inputs,
+            dy_dx,
+            gridtype,
+            align_corners,
+        )
+        # permute back to [B, L * C]
+        outputs = outputs.permute(1, 0, 2).reshape(B, L * C)
+        ctx.save_for_backward(inputs, embeddings, offsets, dy_dx)
+        ctx.dims = [B, D, C, L, S, H, gridtype]
+        ctx.calc_grad_inputs = calc_grad_inputs
+        ctx.align_corners = align_corners
+        return outputs
+    @staticmethod
+    def backward(ctx, grad):
+        inputs, embeddings, offsets, dy_dx = ctx.saved_tensors
+        B, D, C, L, S, H, gridtype = ctx.dims
+        calc_grad_inputs = ctx.calc_grad_inputs
+        align_corners = ctx.align_corners
+        # grad: [B, L * C] --> [L, B, C]
+        grad = grad.view(B, L, C).permute(1, 0, 2).contiguous()
+        grad_embeddings = torch.zeros_like(embeddings)
+        if calc_grad_inputs:
+            grad_inputs = torch.zeros_like(inputs, dtype=embeddings.dtype)
+        else:
+            grad_inputs = torch.zeros(1, device=inputs.device, dtype=embeddings.dtype)
+        grid_encoder_ext.backward(
+            grad,
+            inputs,
+            embeddings,
+            offsets,
+            grad_embeddings,
+            B,
+            D,
+            C,
+            L,
+            S,
+            H,
+            calc_grad_inputs,
+            dy_dx,
+            grad_inputs,
+            gridtype,
+            align_corners,
+        )
+        if calc_grad_inputs:
+            grad_inputs = grad_inputs.to(inputs.dtype)
+            return grad_inputs, grad_embeddings, None, None, None, None, None, None
+        else:
+            return None, grad_embeddings, None, None, None, None, None, None
+class GridEncoder(torch.nn.Module):
+    def __init__(
+        self,
+        in_channels,
+        n_levels,
+        lvl_channels,
+        desired_resolution,
+        per_level_scale=2,
+        base_resolution=16,
+        log2_hashmap_size=19,
+        gridtype="hash",
+        align_corners=False,
+    ):
+        super(GridEncoder, self).__init__()
+        self.in_channels = in_channels
+        self.n_levels = n_levels  # num levels, each level multiply resolution by 2
+        self.lvl_channels = lvl_channels  # encode channels per level
+        self.per_level_scale = 2 ** (
+            math.log2(desired_resolution / base_resolution) / (n_levels - 1)
+        )
+        self.log2_hashmap_size = log2_hashmap_size
+        self.base_resolution = base_resolution
+        self.output_dim = n_levels * lvl_channels
+        self.gridtype = gridtype
+        self.gridtype_id = 0 if gridtype == "hash" else 1
+        self.align_corners = align_corners
+        # allocate parameters
+        offsets = []
+        offset = 0
+        self.max_params = 2**log2_hashmap_size
+        for i in range(n_levels):
+            resolution = int(math.ceil(base_resolution * per_level_scale**i))
+            params_in_level = min(
+                self.max_params,
+                (resolution if align_corners else resolution + 1) ** in_channels,
+            )  # limit max number
+            params_in_level = int(math.ceil(params_in_level / 8) * 8)  # make divisible
+            offsets.append(offset)
+            offset += params_in_level
+        offsets.append(offset)
+        offsets = torch.from_numpy(np.array(offsets, dtype=np.int32))
+        self.register_buffer("offsets", offsets)
+        self.n_params = offsets[-1] * lvl_channels
+        self.embeddings = torch.nn.Parameter(torch.empty(offset, lvl_channels))
+        self._init_weights()
+    def _init_weights(self):
+        self.embeddings.data.uniform_(-1e-4, 1e-4)
+    def forward(self, inputs, bound=1):
+        # inputs: [..., in_channels], normalized real world positions in [-bound, bound]
+        # return: [..., n_levels * lvl_channels]
+        inputs = (inputs + bound) / (2 * bound)  # map to [0, 1]
+        prefix_shape = list(inputs.shape[:-1])
+        inputs = inputs.view(-1, self.in_channels)
+        outputs = GridEncoderFunction.apply(
+            inputs,
+            self.embeddings,
+            self.offsets,
+            self.per_level_scale,
+            self.base_resolution,
+            inputs.requires_grad,
+            self.gridtype_id,
+            self.align_corners,
+        )
+        return outputs.view(prefix_shape + [self.output_dim])

gaussiancity/extensions/grid_encoder/bindings.cpp ADDED Viewed

	@@ -0,0 +1,40 @@

+/**
+ * @File:   grid_encoder_ext_cuda.cpp
+ * @Author: Jiaxiang Tang (@ashawkey)
+ * @Date:   2023-04-15 10:39:17
+ * @Last Modified by: Haozhe Xie
+ * @Last Modified at: 2023-04-15 11:01:32
+ * @Email:  ashawkey1999@gmail.com
+ * @Ref: https://github.com/ashawkey/torch-ngp
+ */
+#include <stdint.h>
+#include <torch/extension.h>
+#include <torch/torch.h>
+// Declarations of the host entry points implemented in grid_encoder_ext.cu.
+// inputs: [B, D], float, in [0, 1]
+// embeddings: [sO, C], float
+// offsets: [L + 1], int32 (the implementation checks for an Int tensor)
+// outputs: [L, B, C], float (level-major; the Python wrapper permutes it back)
+// H: base resolution
+void grid_encode_forward(const at::Tensor inputs, const at::Tensor embeddings,
+                         const at::Tensor offsets, at::Tensor outputs,
+                         const uint32_t B, const uint32_t D, const uint32_t C,
+                         const uint32_t L, const float S, const uint32_t H,
+                         const bool calc_grad_inputs, at::Tensor dy_dx,
+                         const uint32_t gridtype, const bool align_corners);
+// Backward counterpart: accumulates into grad_embeddings and, when
+// calc_grad_inputs is set, writes grad_inputs.
+void grid_encode_backward(const at::Tensor grad, const at::Tensor inputs,
+                          const at::Tensor embeddings, const at::Tensor offsets,
+                          at::Tensor grad_embeddings, const uint32_t B,
+                          const uint32_t D, const uint32_t C, const uint32_t L,
+                          const float S, const uint32_t H,
+                          const bool calc_grad_inputs, const at::Tensor dy_dx,
+                          at::Tensor grad_inputs, const uint32_t gridtype,
+                          const bool align_corners);
+// Expose both entry points to Python as `forward` and `backward`.
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+  m.def("forward", &grid_encode_forward,
+        "grid_encode_forward (CUDA)");
+  m.def("backward", &grid_encode_backward,
+        "grid_encode_backward (CUDA)");
+}

gaussiancity/extensions/grid_encoder/grid_encoder_ext.cu ADDED Viewed

	@@ -0,0 +1,605 @@

+/**
+ * @File:   grid_encoder_ext.cu
+ * @Author: Jiaxiang Tang (@ashawkey)
+ * @Date:   2023-04-15 10:43:16
+ * @Last Modified by: Haozhe Xie
+ * @Last Modified at: 2023-04-29 11:47:54
+ * @Email:  ashawkey1999@gmail.com
+ * @Ref: https://github.com/ashawkey/torch-ngp
+ */
+#include <cuda.h>
+#include <cuda_fp16.h>
+#include <cuda_runtime.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <torch/torch.h>
+#include <algorithm>
+#include <stdexcept>
+#include <cstdio>
+#include <stdint.h>
+// Argument validation helpers; each one raises through TORCH_CHECK with a
+// message naming the offending tensor expression.
+// CHECK_CUDA: the tensor must live on a CUDA device.
+#define CHECK_CUDA(x)                                                          \
+  TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor")
+// CHECK_CONTIGUOUS: the tensor must be contiguous.
+#define CHECK_CONTIGUOUS(x)                                                    \
+  TORCH_CHECK(x.is_contiguous(), #x " must be a contiguous tensor")
+// CHECK_IS_INT: the tensor's scalar type must be Int.
+#define CHECK_IS_INT(x)                                                        \
+  TORCH_CHECK(x.scalar_type() == at::ScalarType::Int,                          \
+              #x " must be an int tensor")
+// CHECK_IS_FLOATING: the tensor's scalar type must be Float, Half or Double.
+#define CHECK_IS_FLOATING(x)                                                   \
+  TORCH_CHECK(x.scalar_type() == at::ScalarType::Float ||                      \
+                  x.scalar_type() == at::ScalarType::Half ||                   \
+                  x.scalar_type() == at::ScalarType::Double,                   \
+              #x " must be a floating tensor")
+// atomicAdd overload for at::Half, needed so that the generic
+// atomicAdd(&grad_grid[...], ...) call in kernel_grid_backward compiles (and
+// works) when AT_DISPATCH_FLOATING_TYPES_AND_HALF selects scalar_t = at::Half.
+// The previous stub had no return statement and performed no addition, which
+// is undefined behaviour and silently dropped the update whenever it was
+// reached (half precision with a single channel per thread).
+// Returns the value stored at `address` before the addition.
+static inline __device__ at::Half atomicAdd(at::Half *address, at::Half val) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
+  // Native __half atomicAdd: requires CUDA >= 10 and ARCH >= 70. It is slow
+  // compared to float or __half2, which is why the __half2 path is preferred.
+  return atomicAdd(reinterpret_cast<__half *>(address),
+                   static_cast<__half>(val));
+#else
+  // No native 16-bit atomicAdd: emulate it with a compare-and-swap loop on
+  // the aligned 32-bit word that contains the target half.
+  const uintptr_t raw = reinterpret_cast<uintptr_t>(address);
+  unsigned int *word =
+      reinterpret_cast<unsigned int *>(raw & ~static_cast<uintptr_t>(2));
+  const bool upper_half = (raw & 2) != 0;
+  unsigned int observed = *word;
+  unsigned int assumed;
+  at::Half previous;
+  do {
+    assumed = observed;
+    previous.x = static_cast<unsigned short>(upper_half ? (assumed >> 16)
+                                                        : (assumed & 0xffffu));
+    const at::Half updated(static_cast<float>(previous) +
+                           static_cast<float>(val));
+    const unsigned int replacement =
+        upper_half ? ((assumed & 0x0000ffffu) |
+                      (static_cast<unsigned int>(updated.x) << 16))
+                   : ((assumed & 0xffff0000u) |
+                      static_cast<unsigned int>(updated.x));
+    observed = atomicCAS(word, assumed, replacement);
+  } while (observed != assumed);
+  return previous;
+#endif
+}
+// Ceiling division: how many chunks of size `divisor` are needed to cover
+// `val` (used to size kernel launch grids).
+template <typename T>
+static inline __host__ __device__ T div_round_up(T val, T divisor) {
+  return (divisor + val - 1) / divisor;
+}
+// Hash a D-dimensional integer grid coordinate into a single 32-bit value by
+// XOR-ing each coordinate multiplied by a per-dimension constant.
+template <uint32_t D>
+__device__ uint32_t fast_hash(const uint32_t pos_grid[D]) {
+  static_assert(D <= 7, "fast_hash can only hash up to 7 dimensions.");
+  // The first multiplier is 1, which is not a prime (let alone a good hashing
+  // prime), but it keeps memory accesses coherent and is good enough to get a
+  // uniformly colliding index out of high-dimensional coordinates.
+  constexpr uint32_t primes[7] = {1,          2654435761, 805459861, 3674653429,
+                                  2097192037, 1434869437, 2165219737};
+  uint32_t hash = 0;
+#pragma unroll
+  for (uint32_t dim = 0; dim < D; ++dim) {
+    hash = hash ^ (pos_grid[dim] * primes[dim]);
+  }
+  return hash;
+}
+// Map an integer grid coordinate to the offset of channel `ch` of its entry
+// inside one level's table (`hashmap_size` entries of C channels each).
+template <uint32_t D, uint32_t C>
+__device__ uint32_t get_grid_index(const uint32_t gridtype,
+                                   const bool align_corners, const uint32_t ch,
+                                   const uint32_t hashmap_size,
+                                   const uint32_t resolution,
+                                   const uint32_t pos_grid[D]) {
+  // Number of grid points along one axis.
+  const uint32_t side = align_corners ? resolution : (resolution + 1);
+  uint32_t linear = 0;
+  uint32_t stride = 1;
+  // Dense (row-major style) index; accumulation stops as soon as the stride
+  // has outgrown the table.
+#pragma unroll
+  for (uint32_t d = 0; d < D && stride <= hashmap_size; d++) {
+    linear += pos_grid[d] * stride;
+    stride *= side;
+  }
+  // NOTE: for NeRF, the hash is in fact not necessary. Check
+  // https://github.com/NVlabs/instant-ngp/issues/97. gridtype: 0 == hash, 1 ==
+  // tiled
+  const bool dense_grid_too_large = stride > hashmap_size;
+  if (dense_grid_too_large && gridtype == 0) {
+    linear = fast_hash<D>(pos_grid);
+  }
+  const uint32_t entry = linear % hashmap_size;
+  return entry * C + ch;
+}
+// Forward kernel. Each thread handles one sample `b` (x dimension of the
+// launch) at one level (blockIdx.y): it multilinearly interpolates the C
+// channels of the 2^D surrounding grid entries and writes them to
+// outputs[level][b][:]. When calc_grad_inputs is set it also stores the
+// derivative of those outputs w.r.t. each input coordinate into dy_dx
+// (indexed as [B, L, D, C]). Samples with any coordinate outside [0, 1]
+// produce zeros.
+template <typename scalar_t, uint32_t D, uint32_t C>
+__global__ void
+kernel_grid(const float *__restrict__ inputs, const scalar_t *__restrict__ grid,
+            const int *__restrict__ offsets, scalar_t *__restrict__ outputs,
+            const uint32_t B, const uint32_t L, const float S, const uint32_t H,
+            const bool calc_grad_inputs, scalar_t *__restrict__ dy_dx,
+            const uint32_t gridtype, const bool align_corners) {
+  const uint32_t b = blockIdx.x * blockDim.x + threadIdx.x;
+  if (b >= B)
+    return;
+  const uint32_t level = blockIdx.y;
+  // locate: advance the pointers to this level's table, this sample's
+  // coordinates and this (level, sample) output slot
+  grid += (uint32_t)offsets[level] * C;
+  inputs += b * D;
+  outputs += level * B * C + b * C;
+  // check input range (should be in [0, 1])
+  bool flag_oob = false;
+#pragma unroll
+  for (uint32_t d = 0; d < D; d++) {
+    if (inputs[d] < 0 || inputs[d] > 1) {
+      flag_oob = true;
+    }
+  }
+  // if input out of bound, just set output to 0
+  if (flag_oob) {
+#pragma unroll
+    for (uint32_t ch = 0; ch < C; ch++) {
+      outputs[ch] = 0;
+    }
+    if (calc_grad_inputs) {
+      dy_dx += b * D * L * C + level * D * C; // B L D C
+#pragma unroll
+      for (uint32_t d = 0; d < D; d++) {
+#pragma unroll
+        for (uint32_t ch = 0; ch < C; ch++) {
+          dy_dx[d * C + ch] = 0;
+        }
+      }
+    }
+    return;
+  }
+  // number of entries in this level's table
+  const uint32_t hashmap_size = offsets[level + 1] - offsets[level];
+  // S is log2 of the per-level scale, so exp2f(level * S) is scale^level
+  const float scale = exp2f(level * S) * H - 1.0f;
+  const uint32_t resolution = (uint32_t)ceil(scale) + 1;
+  // calculate coordinate: pos_grid is the integer cell corner, pos the
+  // fractional position inside that cell
+  float pos[D];
+  uint32_t pos_grid[D];
+#pragma unroll
+  for (uint32_t d = 0; d < D; d++) {
+    pos[d] = inputs[d] * scale + (align_corners ? 0.0f : 0.5f);
+    pos_grid[d] = floorf(pos[d]);
+    pos[d] -= (float)pos_grid[d];
+  }
+  // printf("[b=%d, l=%d] pos=(%f, %f)+(%d, %d)\n", b, level, pos[0], pos[1],
+  // pos_grid[0], pos_grid[1]);
+  // interpolate over the 2^D cell corners; bit d of idx selects the lower
+  // or upper corner along dimension d
+  scalar_t results[C] = {0}; // temp results in register
+#pragma unroll
+  for (uint32_t idx = 0; idx < (1 << D); idx++) {
+    float w = 1;
+    uint32_t pos_grid_local[D];
+#pragma unroll
+    for (uint32_t d = 0; d < D; d++) {
+      if ((idx & (1 << d)) == 0) {
+        w *= 1 - pos[d];
+        pos_grid_local[d] = pos_grid[d];
+      } else {
+        w *= pos[d];
+        pos_grid_local[d] = pos_grid[d] + 1;
+      }
+    }
+    uint32_t index = get_grid_index<D, C>(
+        gridtype, align_corners, 0, hashmap_size, resolution, pos_grid_local);
+// writing to register (fast)
+#pragma unroll
+    for (uint32_t ch = 0; ch < C; ch++) {
+      results[ch] += w * grid[index + ch];
+    }
+    // printf("[b=%d, l=%d] int %d, idx %d, w %f, val %f\n", b, level, idx,
+    // index, w, grid[index]);
+  }
+// writing to global memory (slow)
+#pragma unroll
+  for (uint32_t ch = 0; ch < C; ch++) {
+    outputs[ch] = results[ch];
+  }
+  // prepare dy_dx for calc_grad_inputs
+  // differentiable (soft) indexing:
+  // https://discuss.pytorch.org/t/differentiable-indexing/17647/9
+  if (calc_grad_inputs) {
+    dy_dx += b * D * L * C + level * D * C; // B L D C
+#pragma unroll
+    for (uint32_t gd = 0; gd < D; gd++) {
+      // gradient w.r.t. dimension gd: weight the (right - left) difference
+      // along gd by the interpolation weights of the other D - 1 dimensions
+      scalar_t results_grad[C] = {0};
+#pragma unroll
+      for (uint32_t idx = 0; idx < (1 << (D - 1)); idx++) {
+        float w = scale;
+        uint32_t pos_grid_local[D];
+#pragma unroll
+        for (uint32_t nd = 0; nd < D - 1; nd++) {
+          // map the index over the remaining dimensions, skipping gd
+          const uint32_t d = (nd >= gd) ? (nd + 1) : nd;
+          if ((idx & (1 << nd)) == 0) {
+            w *= 1 - pos[d];
+            pos_grid_local[d] = pos_grid[d];
+          } else {
+            w *= pos[d];
+            pos_grid_local[d] = pos_grid[d] + 1;
+          }
+        }
+        pos_grid_local[gd] = pos_grid[gd];
+        uint32_t index_left =
+            get_grid_index<D, C>(gridtype, align_corners, 0, hashmap_size,
+                                 resolution, pos_grid_local);
+        pos_grid_local[gd] = pos_grid[gd] + 1;
+        uint32_t index_right =
+            get_grid_index<D, C>(gridtype, align_corners, 0, hashmap_size,
+                                 resolution, pos_grid_local);
+#pragma unroll
+        for (uint32_t ch = 0; ch < C; ch++) {
+          results_grad[ch] +=
+              w * (grid[index_right + ch] - grid[index_left + ch]);
+        }
+      }
+#pragma unroll
+      for (uint32_t ch = 0; ch < C; ch++) {
+        dy_dx[gd * C + ch] = results_grad[ch];
+      }
+    }
+  }
+}
+// Backward kernel for the embedding table. Each thread handles N_C
+// consecutive channels (starting at `ch`) of one sample `b` at one level
+// (blockIdx.y) and scatters the incoming gradient, weighted by the same
+// multilinear weights as the forward pass, into grad_grid with atomic adds.
+// Samples with any coordinate outside [0, 1] contribute nothing.
+// NOTE(review): the `grid` parameter is not read in this kernel.
+template <typename scalar_t, uint32_t D, uint32_t C, uint32_t N_C>
+__global__ void kernel_grid_backward(
+    const scalar_t *__restrict__ grad, const float *__restrict__ inputs,
+    const scalar_t *__restrict__ grid, const int *__restrict__ offsets,
+    scalar_t *__restrict__ grad_grid, const uint32_t B, const uint32_t L,
+    const float S, const uint32_t H, const uint32_t gridtype,
+    const bool align_corners) {
+  // the flat thread index counts groups of N_C channels; recover the sample
+  const uint32_t b = (blockIdx.x * blockDim.x + threadIdx.x) * N_C / C;
+  if (b >= B)
+    return;
+  const uint32_t level = blockIdx.y;
+  // first channel handled by this thread
+  const uint32_t ch = (blockIdx.x * blockDim.x + threadIdx.x) * N_C - b * C;
+  // locate
+  grad_grid += offsets[level] * C;
+  inputs += b * D;
+  grad += level * B * C + b * C + ch; // L, B, C
+  const uint32_t hashmap_size = offsets[level + 1] - offsets[level];
+  const float scale = exp2f(level * S) * H - 1.0f;
+  const uint32_t resolution = (uint32_t)ceil(scale) + 1;
+// check input range (should be in [0, 1])
+#pragma unroll
+  for (uint32_t d = 0; d < D; d++) {
+    if (inputs[d] < 0 || inputs[d] > 1) {
+      return; // grad is init as 0, so we simply return.
+    }
+  }
+  // calculate coordinate (same mapping as the forward kernel)
+  float pos[D];
+  uint32_t pos_grid[D];
+#pragma unroll
+  for (uint32_t d = 0; d < D; d++) {
+    pos[d] = inputs[d] * scale + (align_corners ? 0.0f : 0.5f);
+    pos_grid[d] = floorf(pos[d]);
+    pos[d] -= (float)pos_grid[d];
+  }
+  scalar_t grad_cur[N_C] = {0}; // fetch to register
+#pragma unroll
+  for (uint32_t c = 0; c < N_C; c++) {
+    grad_cur[c] = grad[c];
+  }
+// interpolate: visit the 2^D cell corners, bit d of idx selects the corner
+#pragma unroll
+  for (uint32_t idx = 0; idx < (1 << D); idx++) {
+    float w = 1;
+    uint32_t pos_grid_local[D];
+#pragma unroll
+    for (uint32_t d = 0; d < D; d++) {
+      if ((idx & (1 << d)) == 0) {
+        w *= 1 - pos[d];
+        pos_grid_local[d] = pos_grid[d];
+      } else {
+        w *= pos[d];
+        pos_grid_local[d] = pos_grid[d] + 1;
+      }
+    }
+    uint32_t index = get_grid_index<D, C>(
+        gridtype, align_corners, ch, hashmap_size, resolution, pos_grid_local);
+    // atomicAdd for __half is slow (especially for large values), so we use
+    // __half2 if N_C % 2 == 0
+    // TODO: use float which is better than __half, if N_C % 2 != 0
+    if (std::is_same<scalar_t, at::Half>::value && N_C % 2 == 0) {
+#pragma unroll
+      for (uint32_t c = 0; c < N_C; c += 2) {
+        // process two __half at once (by interpreting as a __half2)
+        __half2 v = {(__half)(w * grad_cur[c]), (__half)(w * grad_cur[c + 1])};
+        atomicAdd((__half2 *)&grad_grid[index + c], v);
+      }
+      // float, or __half when N_C % 2 != 0 (which means C == 1)
+    } else {
+#pragma unroll
+      for (uint32_t c = 0; c < N_C; c++) {
+        atomicAdd(&grad_grid[index + c], w * grad_cur[c]);
+      }
+    }
+  }
+}
+// Backward kernel for the inputs. Each thread owns one (sample, input
+// dimension) pair and sums grad * dy_dx over every level and channel.
+// grad is indexed as [L, B, C], dy_dx as [B, L, D, C], grad_inputs as [B, D].
+template <typename scalar_t, uint32_t D, uint32_t C>
+__global__ void kernel_input_backward(const scalar_t *__restrict__ grad,
+                                      const scalar_t *__restrict__ dy_dx,
+                                      scalar_t *__restrict__ grad_inputs,
+                                      uint32_t B, uint32_t L) {
+  const uint32_t t = blockIdx.x * blockDim.x + threadIdx.x;
+  if (!(t < B * D))
+    return;
+  // split the flat index into sample and coordinate dimension
+  const uint32_t sample = t / D;
+  const uint32_t dim = t % D;
+  // jump to this sample's slice of dy_dx
+  dy_dx += sample * L * D * C;
+  scalar_t acc = 0;
+#pragma unroll
+  for (int lvl = 0; lvl < L; lvl++) {
+#pragma unroll
+    for (int c = 0; c < C; c++) {
+      acc += grad[lvl * B * C + sample * C + c] *
+             dy_dx[lvl * D * C + dim * C + c];
+    }
+  }
+  grad_inputs[t] = acc;
+}
+// Host-side launcher for kernel_grid: selects the instantiation matching the
+// runtime channel count C and launches it with one thread per sample along x
+// and one block row per level along y.
+// Throws std::runtime_error when C is not 1, 2, 4 or 8.
+template <typename scalar_t, uint32_t D>
+void kernel_grid_wrapper(const float *inputs, const scalar_t *embeddings,
+                         const int *offsets, scalar_t *outputs,
+                         const uint32_t B, const uint32_t C, const uint32_t L,
+                         const float S, const uint32_t H,
+                         const bool calc_grad_inputs, scalar_t *dy_dx,
+                         const uint32_t gridtype, const bool align_corners) {
+  static constexpr uint32_t N_THREAD = 512;
+  // A grid with a zero-sized dimension is an invalid launch configuration, so
+  // the launch is skipped when there are no samples or no levels (C is still
+  // validated by the switch below).
+  const bool has_work = B > 0 && L > 0;
+  const dim3 blocks_hashgrid = {div_round_up(B, N_THREAD), L, 1};
+  switch (C) {
+  case 1:
+    if (has_work)
+      kernel_grid<scalar_t, D, 1><<<blocks_hashgrid, N_THREAD>>>(
+          inputs, embeddings, offsets, outputs, B, L, S, H, calc_grad_inputs,
+          dy_dx, gridtype, align_corners);
+    break;
+  case 2:
+    if (has_work)
+      kernel_grid<scalar_t, D, 2><<<blocks_hashgrid, N_THREAD>>>(
+          inputs, embeddings, offsets, outputs, B, L, S, H, calc_grad_inputs,
+          dy_dx, gridtype, align_corners);
+    break;
+  case 4:
+    if (has_work)
+      kernel_grid<scalar_t, D, 4><<<blocks_hashgrid, N_THREAD>>>(
+          inputs, embeddings, offsets, outputs, B, L, S, H, calc_grad_inputs,
+          dy_dx, gridtype, align_corners);
+    break;
+  case 8:
+    if (has_work)
+      kernel_grid<scalar_t, D, 8><<<blocks_hashgrid, N_THREAD>>>(
+          inputs, embeddings, offsets, outputs, B, L, S, H, calc_grad_inputs,
+          dy_dx, gridtype, align_corners);
+    break;
+  default:
+    throw std::runtime_error{"GridEncoding: C must be 1, 2, 4, or 8."};
+  }
+}
+// Dispatches the forward pass on the runtime input dimensionality D.
+// inputs: [B, D], float, in [0, 1]
+// embeddings: [sO, C], float
+// offsets: [L + 1], int32
+// outputs: [L, B, C], float (L first, so only one level of hashmap needs to fit
+// into cache at a time.)
+// H: base resolution
+// dy_dx: [B, L * D * C]
+// Throws std::runtime_error when D is not 2, 3, 4 or 5 (or, from the wrapper,
+// when C is unsupported).
+template <typename scalar_t>
+void grid_encode_forward_cuda(const float *inputs, const scalar_t *embeddings,
+                              const int *offsets, scalar_t *outputs,
+                              const uint32_t B, const uint32_t D,
+                              const uint32_t C, const uint32_t L, const float S,
+                              const uint32_t H, const bool calc_grad_inputs,
+                              scalar_t *dy_dx, const uint32_t gridtype,
+                              const bool align_corners) {
+  switch (D) {
+  case 2:
+    kernel_grid_wrapper<scalar_t, 2>(inputs, embeddings, offsets, outputs, B, C,
+                                     L, S, H, calc_grad_inputs, dy_dx, gridtype,
+                                     align_corners);
+    break;
+  case 3:
+    kernel_grid_wrapper<scalar_t, 3>(inputs, embeddings, offsets, outputs, B, C,
+                                     L, S, H, calc_grad_inputs, dy_dx, gridtype,
+                                     align_corners);
+    break;
+  case 4:
+    kernel_grid_wrapper<scalar_t, 4>(inputs, embeddings, offsets, outputs, B, C,
+                                     L, S, H, calc_grad_inputs, dy_dx, gridtype,
+                                     align_corners);
+    break;
+  case 5:
+    kernel_grid_wrapper<scalar_t, 5>(inputs, embeddings, offsets, outputs, B, C,
+                                     L, S, H, calc_grad_inputs, dy_dx, gridtype,
+                                     align_corners);
+    break;
+  default:
+    // This switch is over D; the message previously named C by mistake.
+    throw std::runtime_error{"GridEncoding: D must be 2, 3, 4, or 5."};
+  }
+}
+// Host-side launcher for the backward pass: selects the instantiations
+// matching the runtime channel count C, launches kernel_grid_backward
+// (gradient w.r.t. the embeddings) and, when calc_grad_inputs is set,
+// kernel_input_backward (gradient w.r.t. the inputs).
+// Throws std::runtime_error when C is not 1, 2, 4 or 8.
+template <typename scalar_t, uint32_t D>
+void kernel_grid_backward_wrapper(
+    const scalar_t *grad, const float *inputs, const scalar_t *embeddings,
+    const int *offsets, scalar_t *grad_embeddings, const uint32_t B,
+    const uint32_t C, const uint32_t L, const float S, const uint32_t H,
+    const bool calc_grad_inputs, scalar_t *dy_dx, scalar_t *grad_inputs,
+    const uint32_t gridtype, const bool align_corners) {
+  static constexpr uint32_t N_THREAD = 256;
+  const uint32_t N_C = std::min(2u, C); // n_features_per_thread
+  // A grid with a zero-sized dimension is an invalid launch configuration, so
+  // launches are skipped when there is nothing to process (C is still
+  // validated by the switch below).
+  const bool has_grid_work = B > 0 && L > 0;
+  const bool has_input_work = calc_grad_inputs && B > 0;
+  const dim3 blocks_hashgrid = {div_round_up(B * C / N_C, N_THREAD), L, 1};
+  switch (C) {
+  case 1:
+    if (has_grid_work)
+      kernel_grid_backward<scalar_t, D, 1, 1><<<blocks_hashgrid, N_THREAD>>>(
+          grad, inputs, embeddings, offsets, grad_embeddings, B, L, S, H,
+          gridtype, align_corners);
+    if (has_input_work)
+      kernel_input_backward<scalar_t, D, 1>
+          <<<div_round_up(B * D, N_THREAD), N_THREAD>>>(grad, dy_dx,
+                                                        grad_inputs, B, L);
+    break;
+  case 2:
+    if (has_grid_work)
+      kernel_grid_backward<scalar_t, D, 2, 2><<<blocks_hashgrid, N_THREAD>>>(
+          grad, inputs, embeddings, offsets, grad_embeddings, B, L, S, H,
+          gridtype, align_corners);
+    if (has_input_work)
+      kernel_input_backward<scalar_t, D, 2>
+          <<<div_round_up(B * D, N_THREAD), N_THREAD>>>(grad, dy_dx,
+                                                        grad_inputs, B, L);
+    break;
+  case 4:
+    if (has_grid_work)
+      kernel_grid_backward<scalar_t, D, 4, 2><<<blocks_hashgrid, N_THREAD>>>(
+          grad, inputs, embeddings, offsets, grad_embeddings, B, L, S, H,
+          gridtype, align_corners);
+    if (has_input_work)
+      kernel_input_backward<scalar_t, D, 4>
+          <<<div_round_up(B * D, N_THREAD), N_THREAD>>>(grad, dy_dx,
+                                                        grad_inputs, B, L);
+    break;
+  case 8:
+    if (has_grid_work)
+      kernel_grid_backward<scalar_t, D, 8, 2><<<blocks_hashgrid, N_THREAD>>>(
+          grad, inputs, embeddings, offsets, grad_embeddings, B, L, S, H,
+          gridtype, align_corners);
+    if (has_input_work)
+      kernel_input_backward<scalar_t, D, 8>
+          <<<div_round_up(B * D, N_THREAD), N_THREAD>>>(grad, dy_dx,
+                                                        grad_inputs, B, L);
+    break;
+  default:
+    throw std::runtime_error{"GridEncoding: C must be 1, 2, 4, or 8."};
+  }
+}
+// Dispatches the backward pass on the runtime input dimensionality D.
+// grad: [L, B, C], float
+// inputs: [B, D], float, in [0, 1]
+// embeddings: [sO, C], float
+// offsets: [L + 1], int32
+// grad_embeddings: [sO, C]
+// H: base resolution
+// Throws std::runtime_error when D is not 2, 3, 4 or 5 (or, from the wrapper,
+// when C is unsupported).
+template <typename scalar_t>
+void grid_encode_backward_cuda(
+    const scalar_t *grad, const float *inputs, const scalar_t *embeddings,
+    const int *offsets, scalar_t *grad_embeddings, const uint32_t B,
+    const uint32_t D, const uint32_t C, const uint32_t L, const float S,
+    const uint32_t H, const bool calc_grad_inputs, scalar_t *dy_dx,
+    scalar_t *grad_inputs, const uint32_t gridtype, const bool align_corners) {
+  switch (D) {
+  case 2:
+    kernel_grid_backward_wrapper<scalar_t, 2>(
+        grad, inputs, embeddings, offsets, grad_embeddings, B, C, L, S, H,
+        calc_grad_inputs, dy_dx, grad_inputs, gridtype, align_corners);
+    break;
+  case 3:
+    kernel_grid_backward_wrapper<scalar_t, 3>(
+        grad, inputs, embeddings, offsets, grad_embeddings, B, C, L, S, H,
+        calc_grad_inputs, dy_dx, grad_inputs, gridtype, align_corners);
+    break;
+  case 4:
+    kernel_grid_backward_wrapper<scalar_t, 4>(
+        grad, inputs, embeddings, offsets, grad_embeddings, B, C, L, S, H,
+        calc_grad_inputs, dy_dx, grad_inputs, gridtype, align_corners);
+    break;
+  case 5:
+    kernel_grid_backward_wrapper<scalar_t, 5>(
+        grad, inputs, embeddings, offsets, grad_embeddings, B, C, L, S, H,
+        calc_grad_inputs, dy_dx, grad_inputs, gridtype, align_corners);
+    break;
+  default:
+    // This switch is over D; the message previously named C by mistake.
+    throw std::runtime_error{"GridEncoding: D must be 2, 3, 4, or 5."};
+  }
+}
+// Host entry point bound to Python as `forward`. Validates that every tensor
+// is a contiguous CUDA tensor of the expected kind, then dispatches on the
+// dtype of `embeddings` to grid_encode_forward_cuda, which fills `outputs`
+// (and `dy_dx` when calc_grad_inputs is set).
+void grid_encode_forward(const at::Tensor inputs, const at::Tensor embeddings,
+                         const at::Tensor offsets, at::Tensor outputs,
+                         const uint32_t B, const uint32_t D, const uint32_t C,
+                         const uint32_t L, const float S, const uint32_t H,
+                         const bool calc_grad_inputs, at::Tensor dy_dx,
+                         const uint32_t gridtype, const bool align_corners) {
+  CHECK_CUDA(inputs);
+  CHECK_CUDA(embeddings);
+  CHECK_CUDA(offsets);
+  CHECK_CUDA(outputs);
+  CHECK_CUDA(dy_dx);
+  CHECK_CONTIGUOUS(inputs);
+  CHECK_CONTIGUOUS(embeddings);
+  CHECK_CONTIGUOUS(offsets);
+  CHECK_CONTIGUOUS(outputs);
+  CHECK_CONTIGUOUS(dy_dx);
+  // NOTE(review): inputs passes this check as Float, Half or Double, but is
+  // read below through data_ptr<float>() -- presumably only float32 inputs
+  // are actually supported; confirm.
+  CHECK_IS_FLOATING(inputs);
+  CHECK_IS_FLOATING(embeddings);
+  CHECK_IS_INT(offsets);
+  CHECK_IS_FLOATING(outputs);
+  CHECK_IS_FLOATING(dy_dx);
+  // scalar_t follows embeddings; outputs and dy_dx are accessed with the same
+  // scalar type.
+  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+      embeddings.scalar_type(), "grid_encode_forward", ([&] {
+        grid_encode_forward_cuda<scalar_t>(
+            inputs.data_ptr<float>(), embeddings.data_ptr<scalar_t>(),
+            offsets.data_ptr<int>(), outputs.data_ptr<scalar_t>(), B, D, C, L,
+            S, H, calc_grad_inputs, dy_dx.data_ptr<scalar_t>(), gridtype,
+            align_corners);
+      }));
+}
+// Tensor-level entry point for the grid-encoding backward pass.
+// Validates the tensor arguments, then dispatches grid_encode_backward_cuda on
+// the scalar type of `grad` (float, double or half).
+// `grad_embeddings` and `grad_inputs` are handed over as raw mutable buffers;
+// presumably the CUDA routine accumulates gradients into them -- confirm
+// against kernel_grid_backward_wrapper.
+void grid_encode_backward(const at::Tensor grad, const at::Tensor inputs,
+                          const at::Tensor embeddings, const at::Tensor offsets,
+                          at::Tensor grad_embeddings, const uint32_t B,
+                          const uint32_t D, const uint32_t C, const uint32_t L,
+                          const float S, const uint32_t H,
+                          const bool calc_grad_inputs, const at::Tensor dy_dx,
+                          at::Tensor grad_inputs, const uint32_t gridtype,
+                          const bool align_corners) {
+  // Every tensor must live on a CUDA device ...
+  CHECK_CUDA(grad);
+  CHECK_CUDA(inputs);
+  CHECK_CUDA(embeddings);
+  CHECK_CUDA(offsets);
+  CHECK_CUDA(grad_embeddings);
+  CHECK_CUDA(dy_dx);
+  CHECK_CUDA(grad_inputs);
+  // ... and be contiguous, because only raw data pointers are passed on.
+  CHECK_CONTIGUOUS(grad);
+  CHECK_CONTIGUOUS(inputs);
+  CHECK_CONTIGUOUS(embeddings);
+  CHECK_CONTIGUOUS(offsets);
+  CHECK_CONTIGUOUS(grad_embeddings);
+  CHECK_CONTIGUOUS(dy_dx);
+  CHECK_CONTIGUOUS(grad_inputs);
+  // Dtype checks: offsets is the only integer tensor.
+  CHECK_IS_FLOATING(grad);
+  CHECK_IS_FLOATING(inputs);
+  CHECK_IS_FLOATING(embeddings);
+  CHECK_IS_INT(offsets);
+  CHECK_IS_FLOATING(grad_embeddings);
+  CHECK_IS_FLOATING(dy_dx);
+  CHECK_IS_FLOATING(grad_inputs);
+  // NOTE(review): `inputs` is always read as float (data_ptr<float>), while
+  // the remaining floating tensors use the dispatched scalar_t.
+  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+      grad.scalar_type(), "grid_encode_backward", ([&] {
+        grid_encode_backward_cuda<scalar_t>(
+            grad.data_ptr<scalar_t>(), inputs.data_ptr<float>(),
+            embeddings.data_ptr<scalar_t>(), offsets.data_ptr<int>(),
+            grad_embeddings.data_ptr<scalar_t>(), B, D, C, L, S, H,
+            calc_grad_inputs, dy_dx.data_ptr<scalar_t>(),
+            grad_inputs.data_ptr<scalar_t>(), gridtype, align_corners);
+      }));
+}

gaussiancity/extensions/grid_encoder/setup.py ADDED Viewed

	@@ -0,0 +1,39 @@

+# -*- coding: utf-8 -*-
+#
+# @File:   setup.py
+# @Author: Jiaxiang Tang (@ashawkey)
+# @Date:   2023-04-15 10:33:32
+# @Last Modified by: Haozhe Xie
+# @Last Modified at: 2024-09-18 10:08:45
+# @Email:  ashawkey1999@gmail.com
+# @Ref: https://github.com/ashawkey/torch-ngp
+from setuptools import setup
+from torch.utils.cpp_extension import BuildExtension, CUDAExtension
+setup(
+    name="grid_encoder",
+    version="1.0.0",
+    ext_modules=[
+        CUDAExtension(
+            name="grid_encoder_ext",
+            sources=[
+                "grid_encoder_ext.cu",
+                "bindings.cpp",
+            ],
+            extra_compile_args={
+                "cxx": ["-O3", "-std=c++17"],
+                "nvcc": [
+                    "-O3",
+                    "-std=c++17",
+                    "-U__CUDA_NO_HALF_OPERATORS__",
+                    "-U__CUDA_NO_HALF_CONVERSIONS__",
+                    "-U__CUDA_NO_HALF2_OPERATORS__",
+                ],
+            },
+        ),
+    ],
+    cmdclass={
+        "build_ext": BuildExtension,
+    },
+)

gaussiancity/extensions/voxlib/__init__.py ADDED Viewed

	@@ -0,0 +1,12 @@

+# -*- coding: utf-8 -*-
+#
+# @File:   __init__.py
+# @Author: NVIDIA Corporation
+# @Date:   2021-10-13 00:00:00
+# @Last Modified by: Haozhe Xie
+# @Last Modified at: 2024-10-13 03:14:15
+# @Email:  root@haozhexie.com
+from voxlib import ray_voxel_intersection_perspective
+from voxlib import points_to_volume
+from voxlib import maps_to_volume

gaussiancity/extensions/voxlib/bindings.cpp ADDED Viewed

	@@ -0,0 +1,41 @@

+/**
+ * @File:   bindings.cpp
+ * @Author: NVIDIA Corporation
+ * @Date:   2021-10-13 00:00:00
+ * @Last Modified by: Haozhe Xie
+ * @Last Modified at: 2024-10-13 03:03:45
+ * @Email:  root@haozhexie.com
+ */
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <torch/extension.h>
+#include <vector>
+// Forward declarations of the CUDA entry points. No bodies are given here;
+// the same signatures are defined in the voxlib .cu sources.
+// Fast voxel traversal along rays
+std::vector<torch::Tensor> ray_voxel_intersection_perspective_cuda(
+    const torch::Tensor &in_voxel, const torch::Tensor &cam_ori,
+    const torch::Tensor &cam_dir, const torch::Tensor &cam_up, float cam_f,
+    const std::vector<float> &cam_c, const std::vector<int> &img_dims,
+    int max_samples);
+// Build an [h, w, d] volume from per-point coordinates, ids and extents.
+torch::Tensor points_to_volume_cuda(const torch::Tensor &points,
+                                    const torch::Tensor &pt_ids,
+                                    const torch::Tensor &scales, int h, int w,
+                                    int d);
+// Build a volume from 2D instance / height-field / point-mask maps.
+torch::Tensor
+maps_to_volume_cuda(const torch::Tensor &inst_map, const torch::Tensor &td_hf,
+                    const torch::Tensor &bu_hf,
+                    const torch::Tensor &pts_map,
+                    const torch::Tensor &scales);
+// Python module definition: exposes the three CUDA entry points under the
+// names without the `_cuda` suffix. The module name comes from the
+// TORCH_EXTENSION_NAME macro supplied at build time.
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+  m.def("ray_voxel_intersection_perspective",
+        &ray_voxel_intersection_perspective_cuda,
+        "Ray-voxel intersections given perspective camera parameters (CUDA)");
+  m.def("points_to_volume", &points_to_volume_cuda,
+        "Generate 3D volume from points (CUDA)");
+  m.def("maps_to_volume", &maps_to_volume_cuda,
+        "Generate 3D volume from maps (CUDA)");
+}

gaussiancity/extensions/voxlib/maps_to_volume.cu ADDED Viewed

	@@ -0,0 +1,142 @@

+/**
+ * @File:   maps_to_volume.cu
+ * @Author: Haozhe Xie
+ * @Date:   2024-10-09 15:42:49
+ * @Last Modified by: Haozhe Xie
+ * @Last Modified at: 2024-10-13 12:26:15
+ * @Email:  root@haozhexie.com
+ */
+#include <ATen/cuda/CUDAContext.h>
+#include <torch/extension.h>
+#include "voxlib_common.h"
+// Edge length of the square thread block used to launch the kernel.
+#define TILE_DIM 16
+// Used as the depth (third dimension) of the generated volume.
+#define BLDG_MAX_HEIGHT 504
+// Instance ids below this value are used directly as the semantic class;
+// ids at or above it are treated as building instances.
+#define BLDG_INS_MIN_ID 10
+// Semantic class assigned to every building instance.
+#define BLDG_FACADE_SEM 2
+// NOTE(review): not referenced in this file's visible code; the kernel adds
+// a literal 1 to the instance id for roof voxels -- presumably this constant.
+#define BLDG_ROOF_OFFSET 1
+// Converts 2D maps into a hollow [height, width, depth] int16 volume.
+// One thread handles one pixel (row j, column i). For a pixel flagged in
+// pts_map it walks k from bu_hf to td_hf in steps of the class scale and
+// writes the instance id into volume[j, i, k] for shell voxels only:
+//   - the top layer (k > td_hf - scale),
+//   - pixels within `scale` of the map border,
+//   - pixels with any 8-neighbour (at distance `scale`) whose top height or
+//     instance id differs.
+// Top-layer voxels of buildings store inst + BLDG_ROOF_OFFSET.
+// NOTE(review): `scales` is indexed by the semantic class, so a negative
+// instance id would read out of bounds -- confirm ids are non-negative.
+__global__ void maps_to_volume_cuda_kernel(int height, int width, int depth,
+                                           const int8_t *__restrict__ scales,
+                                           const short *__restrict__ inst_map,
+                                           const short *__restrict__ td_hf,
+                                           const short *__restrict__ bu_hf,
+                                           const bool *__restrict__ pts_map,
+                                           short *__restrict__ volume) {
+  size_t i = blockIdx.x * blockDim.x + threadIdx.x; // width
+  size_t j = blockIdx.y * blockDim.y + threadIdx.y; // height
+  if (i >= width || j >= height) {
+    return;
+  }
+  const size_t pix = j * width + i;
+  // Fix: nonzero is not supported for tensors with more than INT_MAX elements
+  // (presumably why a dense boolean mask is consumed here instead of indices).
+  if (!pts_map[pix]) {
+    return;
+  }
+  short hgt_up = td_hf[pix];
+  short hgt_lw = bu_hf[pix];
+  short inst = inst_map[pix];
+  // WARN: The semantic labels for buildings would be merged to facade.
+  short sem_cls = inst < BLDG_INS_MIN_ID ? inst : BLDG_FACADE_SEM;
+  short scale = scales[sem_cls];
+  if (scale <= 0) {
+    // A non-positive step would never advance k in the loop below.
+    return;
+  }
+  // Everything that decides "shell or interior" except the roof test is
+  // independent of k, so it is evaluated once per pixel.
+  const bool is_xy_border = (i < scale) || (i >= width - scale - 1) ||
+                            (j < scale) || (j >= height - scale - 1);
+  bool nbr_differs = false;
+  if (!is_xy_border) {
+    // Neighbours are only read away from the map border to prevent OOB.
+    for (int dj = -1; dj <= 1 && !nbr_differs; ++dj) {
+      for (int di = -1; di <= 1; ++di) {
+        if (dj == 0 && di == 0) {
+          continue;
+        }
+        const int64_t nj = static_cast<int64_t>(j) + dj * scale;
+        const int64_t ni = static_cast<int64_t>(i) + di * scale;
+        const int64_t nbr = nj * width + ni;
+        if (td_hf[nbr] != hgt_up || inst_map[nbr] != inst) {
+          nbr_differs = true;
+          break;
+        }
+      }
+    }
+  }
+  int64_t vol_offset = static_cast<int64_t>(j) * width * depth + i * depth;
+  for (int k = hgt_lw; k <= hgt_up; k += scale) {
+    // Make all objects hollow: interior voxels are skipped.
+    const bool is_roof = k > hgt_up - scale;
+    if (!is_roof && !is_xy_border && !nbr_differs) {
+      continue;
+    }
+    if (k < 0 || k >= depth) {
+      // Heights outside the volume would write into another pixel's column
+      // or past the end of the buffer.
+      continue;
+    }
+    // Building Roof Handler (Recover roof instance ID)
+    if (is_roof && sem_cls == BLDG_FACADE_SEM) {
+      volume[vol_offset + k] = inst + BLDG_ROOF_OFFSET;
+    } else {
+      volume[vol_offset + k] = inst;
+    }
+  }
+}
+// Builds an int16 [height, width, BLDG_MAX_HEIGHT] volume from 2D maps by
+// launching maps_to_volume_cuda_kernel on the current CUDA stream.
+//   inst_map: int16 map; its first two sizes give height and width
+//   td_hf / bu_hf: int16 maps read as the upper / lower height per pixel
+//   pts_map: bool mask of pixels to process
+//   scales: int8 step per semantic class
+// Raises (TORCH_CHECK) when a tensor is not on CUDA, is not contiguous, or
+// does not hold exactly height * width elements.
+// Returns the zero-initialised volume with shell voxels filled in.
+torch::Tensor maps_to_volume_cuda(const torch::Tensor &inst_map,
+                                  const torch::Tensor &td_hf,
+                                  const torch::Tensor &bu_hf,
+                                  const torch::Tensor &pts_map,
+                                  const torch::Tensor &scales) {
+  CHECK_CUDA(inst_map);
+  CHECK_CUDA(td_hf);
+  CHECK_CUDA(bu_hf);
+  CHECK_CUDA(pts_map);
+  CHECK_CUDA(scales);
+  // The kernel addresses every map as a dense row-major buffer
+  // (index = row * width + col), so strided views would be misread.
+  CHECK_CONTIGUOUS(inst_map);
+  CHECK_CONTIGUOUS(td_hf);
+  CHECK_CONTIGUOUS(bu_hf);
+  CHECK_CONTIGUOUS(pts_map);
+  CHECK_CONTIGUOUS(scales);
+  int curDevice = -1;
+  cudaGetDevice(&curDevice);
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream(curDevice);
+  torch::Device device = inst_map.device();
+  int height = inst_map.size(0);
+  int width = inst_map.size(1);
+  int depth = BLDG_MAX_HEIGHT;
+  // Every map is read at height * width positions; reject smaller or
+  // differently shaped buffers instead of reading out of bounds.
+  const int64_t n_pixels = static_cast<int64_t>(height) * width;
+  TORCH_CHECK(inst_map.numel() == n_pixels,
+              "inst_map must hold exactly height * width elements");
+  TORCH_CHECK(td_hf.numel() == n_pixels,
+              "td_hf must have the same number of elements as inst_map");
+  TORCH_CHECK(bu_hf.numel() == n_pixels,
+              "bu_hf must have the same number of elements as inst_map");
+  TORCH_CHECK(pts_map.numel() == n_pixels,
+              "pts_map must have the same number of elements as inst_map");
+  torch::Tensor volume =
+      torch::zeros({height, width, depth},
+                   torch::TensorOptions().dtype(torch::kInt16).device(device));
+  if (n_pixels == 0) {
+    // A grid with zero blocks is an invalid launch configuration.
+    return volume;
+  }
+  dim3 blockDim(TILE_DIM, TILE_DIM);
+  dim3 gridDim((width + blockDim.x - 1) / blockDim.x,
+               (height + blockDim.y - 1) / blockDim.y);
+  maps_to_volume_cuda_kernel<<<gridDim, blockDim, 0, stream>>>(
+      height, width, depth, scales.data_ptr<int8_t>(),
+      inst_map.data_ptr<short>(), td_hf.data_ptr<short>(),
+      bu_hf.data_ptr<short>(), pts_map.data_ptr<bool>(),
+      volume.data_ptr<short>());
+  // NOTE(review): a failed launch is only printed and the volume is still
+  // returned; consider raising instead.
+  cudaError_t err = cudaGetLastError();
+  if (err != cudaSuccess) {
+    printf("Error in maps_to_volume_cuda_kernel: %s\n",
+           cudaGetErrorString(err));
+  }
+  return volume;
+}

gaussiancity/extensions/voxlib/points_to_volume.cu ADDED Viewed

	@@ -0,0 +1,79 @@

+/**
+ * @File:   points_to_volume.cu
+ * @Author: Haozhe Xie
+ * @Date:   2024-02-24 14:09:38
+ * @Last Modified by: Haozhe Xie
+ * @Last Modified at: 2024-10-13 12:29:46
+ * @Email:  root@haozhexie.com
+ */
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <ATen/cuda/CUDAContext.h>
+#include <torch/extension.h>
+#include "voxlib_common.h"
+// Threads per block of the 1D launch used for the kernel below.
+#define THREADS_PER_BLOCK 256
+// Rasterises points into an [h, w, d] int32 volume, one thread per point.
+// Point `idx` has its origin (x, y, z) at points[3 * idx ..] and its extent
+// (sx, sy, sz) at scales[3 * idx ..]; every voxel in the box starting at the
+// origin, clipped to the volume, is set to pt_ids[idx]. Points whose origin
+// lies outside the volume are ignored. Overlapping boxes are written without
+// synchronisation, so the surviving id is whichever thread writes last.
+__global__ void points_to_volume_cuda_cuda_kernel(
+    size_t n_pts, int h, int w, int d, const short *__restrict__ points,
+    const int *__restrict__ pt_ids, const short *__restrict__ scales,
+    int *__restrict__ volume) {
+  size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx >= n_pts) {
+    return;
+  }
+  int pid = pt_ids[idx];
+  // Kept in size_t: narrowing idx * 3 to int overflows for very large n_pts.
+  size_t idx3 = idx * 3;
+  short x = points[idx3];
+  short y = points[idx3 + 1];
+  short z = points[idx3 + 2];
+  short sx = scales[idx3];
+  short sy = scales[idx3 + 1];
+  short sz = scales[idx3 + 2];
+  if (x >= w || y >= h || z >= d || x < 0 || y < 0 || z < 0) {
+    return;
+  }
+  for (int j = x; j < x + sx && j < w; ++j) {
+    for (int k = y; k < y + sy && k < h; ++k) {
+      for (int l = z; l < z + sz && l < d; ++l) {
+        // Row-major [h, w, d] offset, computed entirely in 64 bits.
+        int64_t vol_idx = (static_cast<int64_t>(k) * w + j) * d + l;
+        volume[vol_idx] = pid;
+      }
+    }
+  }
+}
+// Builds an int32 [h, w, d] volume from a point list by launching
+// points_to_volume_cuda_cuda_kernel on the current CUDA stream.
+//   points: int16, 3 values per point; size(0) is the point count
+//   pt_ids: int32, one id per point
+//   scales: int16, 3 values per point
+// Raises (TORCH_CHECK) when a tensor is not on CUDA, is not contiguous, or
+// does not hold the number of elements the kernel reads.
+// Returns the zero-initialised volume with point boxes filled in; an empty
+// point list yields an all-zero volume.
+torch::Tensor points_to_volume_cuda(const torch::Tensor &points,
+                                    const torch::Tensor &pt_ids,
+                                    const torch::Tensor &scales, int h, int w,
+                                    int d) {
+  CHECK_CUDA(points);
+  CHECK_CUDA(pt_ids);
+  CHECK_CUDA(scales);
+  // The kernel reads flat buffers (3 consecutive values per point).
+  CHECK_CONTIGUOUS(points);
+  CHECK_CONTIGUOUS(pt_ids);
+  CHECK_CONTIGUOUS(scales);
+  size_t n_pts = points.size(0);
+  const int64_t n_pts_i64 = static_cast<int64_t>(n_pts);
+  TORCH_CHECK(points.numel() == n_pts_i64 * 3,
+              "points must hold exactly 3 values per point");
+  TORCH_CHECK(scales.numel() == n_pts_i64 * 3,
+              "scales must hold exactly 3 values per point");
+  TORCH_CHECK(pt_ids.numel() == n_pts_i64,
+              "pt_ids must hold exactly one id per point");
+  int curDevice = -1;
+  cudaGetDevice(&curDevice);
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream(curDevice);
+  torch::Device device = points.device();
+  torch::Tensor volume = torch::zeros(
+      {h, w, d}, torch::TensorOptions().dtype(torch::kInt32).device(device));
+  if (n_pts == 0) {
+    // Zero blocks is an invalid launch configuration; nothing to rasterise.
+    return volume;
+  }
+  int n_blocks = (n_pts + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
+  points_to_volume_cuda_cuda_kernel<<<n_blocks, THREADS_PER_BLOCK, 0, stream>>>(
+      n_pts, h, w, d, points.data_ptr<short>(), pt_ids.data_ptr<int>(),
+      scales.data_ptr<short>(), volume.data_ptr<int>());
+  // NOTE(review): a failed launch is only printed and the volume is still
+  // returned; consider raising instead.
+  cudaError_t err = cudaGetLastError();
+  if (err != cudaSuccess) {
+    printf("Error in points_to_volume_cuda_cuda_kernel: %s\n",
+           cudaGetErrorString(err));
+  }
+  return volume;
+}

gaussiancity/extensions/voxlib/ray_voxel_intersection.cu ADDED Viewed

	@@ -0,0 +1,332 @@

+/**
+ * @File:   ray_voxel_intersection.cu
+ * @Author: NVIDIA Corporation
+ * @Date:   2021-10-13 00:00:00
+ * @Last Modified by: Haozhe Xie
+ * @Last Modified at: 2024-03-27 11:02:41
+ * @Email:  root@haozhexie.com
+ */
+#include <torch/types.h>
+#include <ATen/ATen.h>
+#include <ATen/AccumulateType.h>
+#include <ATen/cuda/CUDAApplyUtils.cuh>
+#include <ATen/cuda/CUDAContext.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <curand.h>
+#include <curand_kernel.h>
+#include <time.h>
+//#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <vector>
+#include "voxlib_common.h"
+// Parameter block passed by value to
+// ray_voxel_intersection_perspective_kernel.
+struct RVIP_Params {
+  int voxel_dims[3];    // sizes of the voxel tensor along its three axes
+  int voxel_strides[3]; // element strides of the voxel tensor
+  int max_samples;      // number of hit slots recorded per ray
+  int img_dims[2];      // [0]: image rows (height), [1]: image columns (width)
+  // Camera parameters
+  float cam_ori[3];  // ray origin
+  float cam_fwd[3];  // normalised viewing direction
+  float cam_side[3]; // normalised cross(cam_fwd, up vector)
+  float cam_up[3];   // normalised cross(cam_side, cam_fwd)
+  float cam_c[2];    // principal point: [0] along rows, [1] along columns
+  float cam_f;       // scale applied to cam_fwd when building ray directions
+  // unsigned long seed;
+};
+/*
+  Casts one ray per pixel (one thread per pixel) through the voxel grid and
+  records, for up to max_samples non-empty voxels along the ray, the voxel
+  value and the ray parameters at which the voxel is entered and left.
+  Slots that are never filled hold voxel id 0 and NaN depths.
+
+  out_voxel_id: torch CUDA int32  [   img_dims[0], img_dims[1], max_samples, 1]
+  out_depth:    torch CUDA float  [2, img_dims[0], img_dims[1], max_samples, 1]
+  out_raydirs:  torch CUDA float  [   img_dims[0], img_dims[1],           1, 3]
+  Image coordinates refer to the center of the pixel [0, 0, 0] at voxel
+  coordinate is at the corner of the corner block (instead of at the center)
+*/
+template <int TILE_DIM>
+static __global__ void ray_voxel_intersection_perspective_kernel(
+    int32_t *__restrict__ out_voxel_id, float *__restrict__ out_depth,
+    float *__restrict__ out_raydirs, const int32_t *__restrict__ in_voxel,
+    const RVIP_Params p) {
+  // img_coords[0] is the row (from blockIdx.y), img_coords[1] the column.
+  int img_coords[2];
+  img_coords[1] = blockIdx.x * TILE_DIM + threadIdx.x;
+  img_coords[0] = blockIdx.y * TILE_DIM + threadIdx.y;
+  if (img_coords[0] >= p.img_dims[0] || img_coords[1] >= p.img_dims[1]) {
+    return;
+  }
+  int pix_index = img_coords[0] * p.img_dims[1] + img_coords[1];
+  // Calculate ray origin and direction
+  float rayori[3], raydir[3];
+  rayori[0] = p.cam_ori[0];
+  rayori[1] = p.cam_ori[1];
+  rayori[2] = p.cam_ori[2];
+  // Camera intrinsics
+  float ndc_imcoords[2];
+  ndc_imcoords[0] = p.cam_c[0] - (float)img_coords[0]; // Flip height
+  ndc_imcoords[1] = (float)img_coords[1] - p.cam_c[1];
+  // Direction = up * dy + side * dx + forward * focal, then normalised.
+  raydir[0] = p.cam_up[0] * ndc_imcoords[0] + p.cam_side[0] * ndc_imcoords[1] +
+              p.cam_fwd[0] * p.cam_f;
+  raydir[1] = p.cam_up[1] * ndc_imcoords[0] + p.cam_side[1] * ndc_imcoords[1] +
+              p.cam_fwd[1] * p.cam_f;
+  raydir[2] = p.cam_up[2] * ndc_imcoords[0] + p.cam_side[2] * ndc_imcoords[1] +
+              p.cam_fwd[2] * p.cam_f;
+  normalize<float, 3>(raydir);
+  // Save out_raydirs
+  out_raydirs[pix_index * 3] = raydir[0];
+  out_raydirs[pix_index * 3 + 1] = raydir[1];
+  out_raydirs[pix_index * 3 + 2] = raydir[2];
+  // axis_t[i]: ray parameter at which the next voxel boundary along axis i is
+  // crossed; axis_int[i]: integer voxel coordinate currently occupied.
+  float axis_t[3];
+  int axis_int[3];
+  // int axis_intbound[3];
+  // Current voxel
+  axis_int[0] = floorf(rayori[0]);
+  axis_int[1] = floorf(rayori[1]);
+  axis_int[2] = floorf(rayori[2]);
+#pragma unroll
+  for (int i = 0; i < 3; i++) {
+    if (raydir[i] > 0) {
+      // Initial t value
+      // Handle boundary case where rayori[i] is a whole number. Always round Up
+      // for the next block
+      // axis_t[i] = (ceilf(nextafterf(rayori[i], HUGE_VALF)) - rayori[i]) /
+      // raydir[i];
+      axis_t[i] = ((float)(axis_int[i] + 1) - rayori[i]) / raydir[i];
+    } else if (raydir[i] < 0) {
+      axis_t[i] = ((float)axis_int[i] - rayori[i]) / raydir[i];
+    } else {
+      // The ray never crosses a boundary along this axis.
+      axis_t[i] = HUGE_VALF;
+    }
+  }
+  // Fused raymarching and sampling
+  // `quit` is sticky: once the ray has left the grid, every remaining sample
+  // slot is written with the defaults below (NaN depths, id 0).
+  bool quit = false;
+  for (int cur_plane = 0; cur_plane < p.max_samples;
+       cur_plane++) { // Last cycle is for calculating p2
+    float t = nanf("0");
+    float t2 = nanf("0");
+    int32_t blk_id = 0;
+    // Find the next intersection
+    while (!quit) {
+      // Find the next smallest t
+      float tnow;
+      // Hand unroll
+      // Step along whichever axis has the nearest boundary crossing; the ray
+      // quits when that step leaves the grid in the direction of travel.
+      if (axis_t[0] <= axis_t[1] && axis_t[0] <= axis_t[2]) {
+        // Update current t
+        tnow = axis_t[0];
+        // Update t candidates
+        if (raydir[0] > 0) {
+          axis_int[0] += 1;
+          if (axis_int[0] >= p.voxel_dims[0]) {
+            quit = true;
+          }
+          axis_t[0] = ((float)(axis_int[0] + 1) - rayori[0]) / raydir[0];
+        } else {
+          axis_int[0] -= 1;
+          if (axis_int[0] < 0) {
+            quit = true;
+          }
+          axis_t[0] = ((float)axis_int[0] - rayori[0]) / raydir[0];
+        }
+      } else if (axis_t[1] <= axis_t[2]) {
+        tnow = axis_t[1];
+        if (raydir[1] > 0) {
+          axis_int[1] += 1;
+          if (axis_int[1] >= p.voxel_dims[1]) {
+            quit = true;
+          }
+          axis_t[1] = ((float)(axis_int[1] + 1) - rayori[1]) / raydir[1];
+        } else {
+          axis_int[1] -= 1;
+          if (axis_int[1] < 0) {
+            quit = true;
+          }
+          axis_t[1] = ((float)axis_int[1] - rayori[1]) / raydir[1];
+        }
+      } else {
+        tnow = axis_t[2];
+        if (raydir[2] > 0) {
+          axis_int[2] += 1;
+          if (axis_int[2] >= p.voxel_dims[2]) {
+            quit = true;
+          }
+          axis_t[2] = ((float)(axis_int[2] + 1) - rayori[2]) / raydir[2];
+        } else {
+          axis_int[2] -= 1;
+          if (axis_int[2] < 0) {
+            quit = true;
+          }
+          axis_t[2] = ((float)axis_int[2] - rayori[2]) / raydir[2];
+        }
+      }
+      if (quit) {
+        break;
+      }
+      // Skip empty space
+      // Could there be deadlock if the ray direction is away from the world?
+      // (Still outside the grid on some axis: keep marching without reading.)
+      if (axis_int[0] < 0 || axis_int[0] >= p.voxel_dims[0] ||
+          axis_int[1] < 0 || axis_int[1] >= p.voxel_dims[1] ||
+          axis_int[2] < 0 || axis_int[2] >= p.voxel_dims[2]) {
+        continue;
+      }
+      // Test intersection using voxel grid
+      int64_t in_voxel_idx =
+          static_cast<int64_t>(axis_int[0]) * p.voxel_strides[0] +
+          static_cast<int64_t>(axis_int[1]) * p.voxel_strides[1] +
+          static_cast<int64_t>(axis_int[2]) * p.voxel_strides[2];
+      blk_id = in_voxel[in_voxel_idx];
+      // Value 0 marks an empty voxel.
+      if (blk_id == 0) {
+        continue;
+      }
+      // Now that there is an intersection
+      t = tnow;
+      // Calculate t2
+      /*
+      #pragma unroll
+      for (int i=0; i<3; i++) {
+          if (axis_t[i] <= axis_t[(i+1)%3] && axis_t[i] <= axis_t[(i+2)%3]) {
+              t2 = axis_t[i];
+              break;
+          }
+      }
+      */
+      // Hand unroll
+      // t2 is the nearest upcoming boundary crossing, i.e. where the ray
+      // leaves the voxel it has just entered.
+      if (axis_t[0] <= axis_t[1] && axis_t[0] <= axis_t[2]) {
+        t2 = axis_t[0];
+      } else if (axis_t[1] <= axis_t[2]) {
+        t2 = axis_t[1];
+      } else {
+        t2 = axis_t[2];
+      }
+      break;
+    } // while !quit (ray marching loop)
+    // Entry depths fill the first half of out_depth, exit depths the second.
+    out_depth[pix_index * p.max_samples + cur_plane] = t;
+    out_depth[p.img_dims[0] * p.img_dims[1] * p.max_samples +
+              pix_index * p.max_samples + cur_plane] = t2;
+    out_voxel_id[pix_index * p.max_samples + cur_plane] = blk_id;
+  } // cur_plane
+}
+/*
+  Casts one ray per image pixel through a voxel grid and returns, per ray, up
+  to max_samples voxel hits together with entry/exit depths and the ray
+  direction. Launches ray_voxel_intersection_perspective_kernel on the current
+  CUDA stream.
+  out:
+   out_voxel_id: torch CUDA int32  [   img_dims[0], img_dims[1], max_samples, 1]
+   out_depth:    torch CUDA float  [2, img_dims[0], img_dims[1], max_samples, 1]
+   out_raydirs:  torch CUDA float  [   img_dims[0], img_dims[1], 1, 3]
+  in:
+   in_voxel:     torch CUDA int32  [X, Y, Z] [40, 512, 512]
+   cam_ori:      torch float [3]
+   cam_dir:      torch float [3]
+   cam_up:       torch float [3]
+   cam_f:        float
+   cam_c:        float  [2]
+   img_dims:     int    [2]
+   max_samples:  int
+  Raises (TORCH_CHECK) when an argument has the wrong device, dtype, rank or
+  number of elements.
+*/
+std::vector<torch::Tensor> ray_voxel_intersection_perspective_cuda(
+    const torch::Tensor &in_voxel, const torch::Tensor &cam_ori,
+    const torch::Tensor &cam_dir, const torch::Tensor &cam_up, float cam_f,
+    const std::vector<float> &cam_c, const std::vector<int> &img_dims,
+    int max_samples) {
+  CHECK_CUDA(in_voxel);
+  // TORCH_CHECK rather than assert(): assert() is compiled out under NDEBUG
+  // and otherwise aborts the whole process instead of raising an error the
+  // Python caller can handle.
+  TORCH_CHECK(in_voxel.scalar_type() == torch::kInt32,
+              "in_voxel must be an int32 tensor"); // Minecraft compatibility
+  TORCH_CHECK(in_voxel.dim() == 3, "in_voxel must be a 3D tensor");
+  TORCH_CHECK(cam_ori.scalar_type() == torch::kFloat32,
+              "cam_ori must be a float32 tensor");
+  TORCH_CHECK(cam_ori.numel() == 3, "cam_ori must have 3 elements");
+  TORCH_CHECK(cam_dir.scalar_type() == torch::kFloat32,
+              "cam_dir must be a float32 tensor");
+  TORCH_CHECK(cam_dir.numel() == 3, "cam_dir must have 3 elements");
+  TORCH_CHECK(cam_up.scalar_type() == torch::kFloat32,
+              "cam_up must be a float32 tensor");
+  TORCH_CHECK(cam_up.numel() == 3, "cam_up must have 3 elements");
+  TORCH_CHECK(cam_c.size() == 2, "cam_c must have 2 elements");
+  TORCH_CHECK(img_dims.size() == 2, "img_dims must have 2 elements");
+  int curDevice = -1;
+  cudaGetDevice(&curDevice);
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream(curDevice);
+  torch::Device device = in_voxel.device();
+  RVIP_Params p;
+  // Calculate camera rays
+  // Host copies are made contiguous because they are read through raw
+  // pointers as three consecutive floats.
+  const torch::Tensor cam_ori_c = cam_ori.cpu().contiguous();
+  const torch::Tensor cam_dir_c = cam_dir.cpu().contiguous();
+  const torch::Tensor cam_up_c = cam_up.cpu().contiguous();
+  // Get the coordinate frame of camera space in world space
+  normalize<float, 3>(p.cam_fwd, cam_dir_c.data_ptr<float>());
+  cross<float>(p.cam_side, p.cam_fwd, cam_up_c.data_ptr<float>());
+  normalize<float, 3>(p.cam_side);
+  cross<float>(p.cam_up, p.cam_side, p.cam_fwd);
+  normalize<float, 3>(p.cam_up); // Not absolutely necessary as both vectors are
+                                 // normalized. But just in case...
+  copyarr<float, 3>(p.cam_ori, cam_ori_c.data_ptr<float>());
+  p.cam_f = cam_f;
+  p.cam_c[0] = cam_c[0];
+  p.cam_c[1] = cam_c[1];
+  p.max_samples = max_samples;
+  // The kernel indexes in_voxel through its strides, so the voxel tensor
+  // itself does not need to be contiguous.
+  p.voxel_dims[0] = in_voxel.size(0);
+  p.voxel_dims[1] = in_voxel.size(1);
+  p.voxel_dims[2] = in_voxel.size(2);
+  p.voxel_strides[0] = in_voxel.stride(0);
+  p.voxel_strides[1] = in_voxel.stride(1);
+  p.voxel_strides[2] = in_voxel.stride(2);
+  p.img_dims[0] = img_dims[0];
+  p.img_dims[1] = img_dims[1];
+  // Create output tensors
+  // For Minecraft Seg Mask
+  torch::Tensor out_voxel_id =
+      torch::empty({p.img_dims[0], p.img_dims[1], p.max_samples, 1},
+                   torch::TensorOptions().dtype(torch::kInt32).device(device));
+  torch::Tensor out_depth;
+  // Produce two sets of localcoords, one for entry point, the other one for
+  // exit point. They share the same corner_ids.
+  out_depth = torch::empty(
+      {2, p.img_dims[0], p.img_dims[1], p.max_samples, 1},
+      torch::TensorOptions().dtype(torch::kFloat32).device(device));
+  torch::Tensor out_raydirs = torch::empty({p.img_dims[0], p.img_dims[1], 1, 3},
+                                           torch::TensorOptions()
+                                               .dtype(torch::kFloat32)
+                                               .device(device)
+                                               .requires_grad(false));
+  if (p.img_dims[0] == 0 || p.img_dims[1] == 0) {
+    // No pixels: a grid with zero blocks is an invalid launch configuration.
+    return {out_voxel_id, out_depth, out_raydirs};
+  }
+  const int TILE_DIM = 8;
+  dim3 dimGrid((p.img_dims[1] + TILE_DIM - 1) / TILE_DIM,
+               (p.img_dims[0] + TILE_DIM - 1) / TILE_DIM, 1);
+  dim3 dimBlock(TILE_DIM, TILE_DIM, 1);
+  ray_voxel_intersection_perspective_kernel<TILE_DIM>
+      <<<dimGrid, dimBlock, 0, stream>>>(
+          out_voxel_id.data_ptr<int32_t>(), out_depth.data_ptr<float>(),
+          out_raydirs.data_ptr<float>(), in_voxel.data_ptr<int32_t>(), p);
+  // NOTE(review): a failed launch is only printed and the (uninitialised)
+  // outputs are still returned; consider raising instead.
+  cudaError_t err = cudaGetLastError();
+  if (err != cudaSuccess) {
+    printf("Error in ray_voxel_intersection_perspective_kernel: %s\n",
+           cudaGetErrorString(err));
+  }
+  return {out_voxel_id, out_depth, out_raydirs};
+}

gaussiancity/extensions/voxlib/setup.py ADDED Viewed

	@@ -0,0 +1,32 @@

+# -*- coding: utf-8 -*-
+#
+# @File:   setup.py
+# @Author: NVIDIA Corporation
+# @Date:   2021-10-13 00:00:00
+# @Last Modified by: Haozhe Xie
+# @Last Modified at: 2024-10-13 03:00:47
+# @Email:  root@haozhexie.com
+from setuptools import setup
+from torch.utils.cpp_extension import BuildExtension, CUDAExtension
+cxx_args = ["-fopenmp"]
+nvcc_args = []
+setup(
+    name="voxlib_ext",
+    version="3.0.0",
+    ext_modules=[
+        CUDAExtension(
+            "voxlib",
+            [
+                "bindings.cpp",
+                "ray_voxel_intersection.cu",
+                "points_to_volume.cu",
+                "maps_to_volume.cu",
+            ],
+            extra_compile_args={"cxx": cxx_args, "nvcc": nvcc_args},
+        )
+    ],
+    cmdclass={"build_ext": BuildExtension},
+)

gaussiancity/extensions/voxlib/voxlib_common.h ADDED Viewed

	@@ -0,0 +1,83 @@

+// Copyright (C) 2021 NVIDIA CORPORATION & AFFILIATES.  All rights reserved.
+//
+// This work is made available under the Nvidia Source Code License-NC.
+// To view a copy of this license, check out LICENSE.md
+#ifndef VOXLIB_COMMON_H
+#define VOXLIB_COMMON_H
+#define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor")
+#define CHECK_CONTIGUOUS(x)                                                    \
+  TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
+#define CHECK_INPUT(x)                                                         \
+  CHECK_CUDA(x);                                                               \
+  CHECK_CONTIGUOUS(x)
+#define CHECK_CPU(x)                                                           \
+  TORCH_CHECK(x.device().is_cpu(), #x " must be a CPU tensor")
+#include <cuda.h>
+#include <cuda_runtime.h>
+// CUDA vector math functions
+// Integer division rounded toward negative infinity (floor), for any sign
+// combination of a and b. b must be non-zero.
+__host__ __device__ __forceinline__ int floor_div(int a, int b) {
+  int c = a / b;
+  // C++ division truncates toward zero, which is one too high whenever the
+  // exact quotient is negative and not an integer, i.e. the operands have
+  // opposite signs and the remainder is non-zero.
+  if ((a % b != 0) && ((a < 0) != (b < 0))) {
+    c--;
+  }
+  return c;
+}
+// 3D cross product r = a x b. Host-only (declared __host__ without
+// __device__). Components of r are written one at a time while a and b are
+// still being read, so r must not alias a or b.
+template <typename scalar_t>
+__host__ __forceinline__ void cross(scalar_t *r, const scalar_t *a,
+                                    const scalar_t *b) {
+  r[0] = a[1] * b[2] - a[2] * b[1];
+  r[1] = a[2] * b[0] - a[0] * b[2];
+  r[2] = a[0] * b[1] - a[1] * b[0];
+}
+// Dot product of two 3-component float vectors.
+__device__ __host__ __forceinline__ float dot(const float *a, const float *b) {
+  return a[0] * b[0] + a[1] * b[1] + a[2] * b[2];
+}
+// Copies the first ndim elements of a into r.
+template <typename scalar_t, int ndim>
+__device__ __host__ __forceinline__ void copyarr(scalar_t *r,
+                                                 const scalar_t *a) {
+#pragma unroll
+  for (int i = 0; i < ndim; i++) {
+    r[i] = a[i];
+  }
+}
+// TODO: use rsqrt to speed up
+// inplace version
+// Scales the ndim-element vector a to unit Euclidean length, in place.
+// There is no guard for a zero-length vector: every component is then
+// divided by zero. The length is computed with sqrtf (single precision)
+// whatever scalar_t is.
+template <typename scalar_t, int ndim>
+__device__ __host__ __forceinline__ void normalize(scalar_t *a) {
+  scalar_t vec_len = 0.0f;
+#pragma unroll
+  for (int i = 0; i < ndim; i++) {
+    vec_len += a[i] * a[i];
+  }
+  vec_len = sqrtf(vec_len);
+#pragma unroll
+  for (int i = 0; i < ndim; i++) {
+    a[i] /= vec_len;
+  }
+}
+// normalize + copy
+// Writes the unit-length version of the ndim-element vector a into r; a is
+// left untouched. There is no guard for a zero-length vector: every
+// component is then divided by zero. The length is computed with sqrtf
+// (single precision) whatever scalar_t is.
+template <typename scalar_t, int ndim>
+__device__ __host__ __forceinline__ void normalize(scalar_t *r,
+                                                   const scalar_t *a) {
+  scalar_t vec_len = 0.0f;
+#pragma unroll
+  for (int i = 0; i < ndim; i++) {
+    vec_len += a[i] * a[i];
+  }
+  vec_len = sqrtf(vec_len);
+#pragma unroll
+  for (int i = 0; i < ndim; i++) {
+    r[i] = a[i] / vec_len;
+  }
+}
+#endif // VOXLIB_COMMON_H

gaussiancity/generator.py ADDED Viewed

	@@ -0,0 +1,536 @@

+# -*- coding: utf-8 -*-
+#
+# @File:   generator.py
+# @Author: Haozhe Xie
+# @Date:   2024-03-09 20:36:52
+# @Last Modified by: Haozhe Xie
+# @Last Modified at: 2024-09-23 20:49:35
+# @Email:  root@haozhexie.com
+import numpy as np
+import torch
+import torch.nn.functional as F
+import extensions.grid_encoder
+import gaussiancity.pt_v3
+class Generator(torch.nn.Module):
+    def __init__(self, cfg, n_classes, proj_size):
+        super(Generator, self).__init__()
+        self.cfg = cfg
+        self.n_classes = n_classes
+        if cfg.ENCODER == "GLOBAL":
+            self.proj_encoder = GlobalEncoder(
+                n_classes, cfg.GLOBAL_ENCODER_N_BLOCKS, cfg.ENCODER_OUT_DIM - 3
+            )
+        elif cfg.ENCODER == "LOCAL":
+            self.proj_encoder = LocalEncoder(n_classes, cfg.ENCODER_OUT_DIM - 3)
+        elif cfg.ENCODER is None:
+            self.proj_encoder = None
+            assert cfg.ENCODER_OUT_DIM == 3
+        else:
+            raise ValueError("Unknown encoder: %s" % cfg.ENCODER)
+        if cfg.POS_EMD == "HASH_GRID":
+            pt_feat_dim = cfg.HASH_GRID_N_LEVELS * cfg.HASH_GRID_LEVEL_DIM
+            self.pos_encoder = extensions.grid_encoder.GridEncoder(
+                in_channels=cfg.ENCODER_OUT_DIM,
+                desired_resolution=proj_size,
+                n_levels=cfg.HASH_GRID_N_LEVELS,
+                lvl_channels=cfg.HASH_GRID_LEVEL_DIM,
+            )
+        elif cfg.POS_EMD == "SIN_COS":
+            pt_feat_dim = 2 * cfg.ENCODER_OUT_DIM * cfg.SIN_COS_FREQ_BENDS
+            self.pos_encoder = SinCosEncoder(cfg.SIN_COS_FREQ_BENDS)
+        else:
+            raise ValueError("Unknown positional encoder: %s" % cfg.POS_EMD)
+        if cfg.PTV3.ENABLED:
+            self.pt_net = gaussiancity.pt_v3.PointTransformerV3(
+                in_channels=pt_feat_dim,
+                order=cfg.PTV3.ORDER,
+                stride=cfg.PTV3.STRIDE,
+                enc_depths=cfg.PTV3.ENC_DEPTHS,
+                enc_channels=cfg.PTV3.ENC_CHANNELS,
+                enc_num_head=cfg.PTV3.ENC_N_HEAD,
+                enc_patch_size=cfg.PTV3.ENC_PATCH_SIZE,
+                dec_depths=cfg.PTV3.DEC_DEPTHS,
+                dec_channels=cfg.PTV3.DEC_CHANNELS,
+                dec_num_head=cfg.PTV3.DEC_N_HEAD,
+                dec_patch_size=cfg.PTV3.DEC_PATCH_SIZE,
+                enable_flash=cfg.PTV3.ENABLE_FLASH_ATTN,
+            )
+            pt_feat_dim += cfg.PTV3.DEC_CHANNELS[0]
+        else:
+            self.pt_net = None
+        self.ga_mlp = GaussianAttrMLP(
+            n_classes,
+            pt_feat_dim,
+            cfg.Z_DIM,
+            cfg.MLP_HIDDEN_DIM,
+            cfg.MLP_N_SHARED_LAYERS,
+            cfg.ATTR_FACTORS,
+            cfg.ATTR_N_LAYERS,
+        )
+    def forward(self, proj_uv, rel_xyz, batch_idx, onehots, z, proj_hf, proj_seg):
+        # Ref: https://github.com/hzxie/CityDreamer/blob/master/models/gancraft.py#L381
+        if self.cfg.ENCODER == "GLOBAL":
+            proj_feat = self.proj_encoder(proj_hf, proj_seg)
+            pt_feat = proj_feat.unsqueeze(dim=1).repeat(1, proj_uv.size(1), 1)
+        elif self.cfg.ENCODER == "LOCAL":
+            proj_feat = self.proj_encoder(proj_hf, proj_seg)
+            pt_feat = (
+                F.grid_sample(proj_feat, proj_uv.unsqueeze(dim=1), align_corners=True)
+                .squeeze(dim=2)
+                .permute(0, 2, 1)
+            )
+        elif self.cfg.ENCODER is None:
+            pt_feat = torch.empty(
+                rel_xyz.size(0), rel_xyz.size(1), 0, device=proj_uv.device
+            )
+        # print(pt_feat.size())  # torch.Size([B, n_pts, cfg.ENCODER_OUT_DIM - 3])
+        pt_feat = torch.cat([pt_feat, rel_xyz], dim=2)
+        # print(pt_feat.size())  # torch.Size([B, n_pts, cfg.ENCODER_OUT_DIM])
+        pt_feat1 = self.pos_encoder(pt_feat)
+        # print(pt_feat1.size())  # torch.Size([B, n_pts, pt_feat_dim])
+        if self.pt_net is None:
+            pt_feat2 = torch.empty(
+                rel_xyz.size(0), rel_xyz.size(1), 0, device=proj_uv.device
+            )
+        else:
+            pt_feat2 = self.pt_net(batch_idx, pt_feat1, rel_xyz)
+        # print(pt_feat2.size())  # torch.Size([B, n_pts, pt_feat_dim])
+        return self.ga_mlp(torch.cat([pt_feat1, pt_feat2], dim=-1), onehots, z)
class GlobalEncoder(torch.nn.Module):
    """Encode a height field and a segmentation map into one vector per sample.

    Both inputs go through a stride-2 3x3 convolution (8 channels each), are
    concatenated, passed through ``n_blocks - 1`` ``SRTConvBlock`` stages,
    averaged over the spatial dimensions and projected to ``out_channels``
    values squashed by ``tanh``.
    """

    def __init__(self, n_classes, n_blocks, out_channels):
        super().__init__()
        stem_kwargs = {"kernel_size": 3, "stride": 2, "padding": 1}
        self.hf_conv = torch.nn.Conv2d(1, 8, **stem_kwargs)
        self.seg_conv = torch.nn.Conv2d(n_classes, 8, **stem_kwargs)
        # The concatenated stem has 16 channels; every block doubles the width.
        n_srt_blocks = len(range(1, n_blocks))
        self.conv_blocks = torch.nn.Sequential(
            *(
                SRTConvBlock(in_channels=16 << depth, out_channels=None)
                for depth in range(n_srt_blocks)
            )
        )
        self.fc1 = torch.nn.Linear(16 << n_srt_blocks, 16)
        self.fc2 = torch.nn.Linear(16, out_channels)
        self.act = torch.nn.LeakyReLU(0.2)

    def forward(self, proj_hf, proj_seg):
        """Return the tanh-bounded global feature for each sample."""
        hf_feat = self.act(self.hf_conv(proj_hf))
        seg_feat = self.act(self.seg_conv(proj_seg))
        feat = torch.cat([hf_feat, seg_feat], dim=1)
        for block in self.conv_blocks:
            feat = self.act(block(feat))
        # Channels last, then average over all spatial positions.
        feat = feat.permute(0, 2, 3, 1)
        pooled = torch.mean(feat.reshape(feat.shape[0], -1, feat.shape[-1]), dim=1)
        hidden = self.act(self.fc1(pooled))
        return torch.tanh(self.fc2(hidden))
class LocalEncoder(torch.nn.Module):
    """Encode a height field and a segmentation map into a dense feature map.

    Two 7x7 stride-2 stems (32 channels each) are concatenated, normalised and
    passed through three ``ResConvBlock`` stages with one 2x average pooling,
    then two stride-2 transposed convolutions and a final 1x1 convolution
    producing ``out_channels`` channels squashed by ``tanh``.
    """

    def __init__(self, n_classes, out_channels):
        super().__init__()
        stem_kwargs = {"kernel_size": 7, "stride": 2, "padding": 3}
        up_kwargs = {"kernel_size": 4, "stride": 2, "padding": 1}
        self.hf_conv = torch.nn.Conv2d(1, 32, **stem_kwargs)
        self.seg_conv = torch.nn.Conv2d(n_classes, 32, **stem_kwargs)
        self.bn1 = torch.nn.GroupNorm(32, 64)
        self.conv2 = ResConvBlock(64, 128)
        self.conv3 = ResConvBlock(128, 256)
        self.conv4 = ResConvBlock(256, 512)
        self.dconv5 = torch.nn.ConvTranspose2d(512, 128, **up_kwargs)
        self.dconv6 = torch.nn.ConvTranspose2d(128, 32, **up_kwargs)
        self.dconv7 = torch.nn.Conv2d(32, out_channels, kernel_size=1)

    def forward(self, proj_hf, proj_seg):
        """Return the tanh-bounded feature map."""
        stem = torch.cat([self.hf_conv(proj_hf), self.seg_conv(proj_seg)], dim=1)
        feat = F.relu(self.bn1(stem), inplace=True)
        # Original author notes: [N, 64, H/2, W/2] here, [N, 128, H/4, W/4] after pooling.
        feat = F.avg_pool2d(self.conv2(feat), 2, stride=2)
        # conv3/conv4 widen the channels; dconv5/dconv6 upsample; dconv7 is 1x1.
        for layer in (self.conv3, self.conv4, self.dconv5, self.dconv6, self.dconv7):
            feat = layer(feat)
        return torch.tanh(feat)
class SRTConvBlock(torch.nn.Module):
    """Two bias-free 3x3 convolutions with ReLU; the second one has stride 2.

    Args:
        in_channels: Number of input channels.
        hidden_channels: Width after the first convolution; defaults to
            ``in_channels``.
        out_channels: Width after the second convolution; defaults to twice
            ``hidden_channels``.
    """

    def __init__(self, in_channels, hidden_channels=None, out_channels=None):
        super().__init__()
        hidden_channels = in_channels if hidden_channels is None else hidden_channels
        out_channels = 2 * hidden_channels if out_channels is None else out_channels
        conv_kwargs = {"kernel_size": 3, "padding": 1, "bias": False}
        self.layers = torch.nn.Sequential(
            torch.nn.Conv2d(in_channels, hidden_channels, stride=1, **conv_kwargs),
            torch.nn.ReLU(),
            torch.nn.Conv2d(hidden_channels, out_channels, stride=2, **conv_kwargs),
            torch.nn.ReLU(),
        )

    def forward(self, x):
        """Apply both convolution stages to ``x``."""
        return self.layers(x)
class ResConvBlock(torch.nn.Module):
    """Residual block whose three conv outputs are concatenated.

    The three 3x3 convolutions produce ``out_channels // 2``,
    ``out_channels // 4`` and ``out_channels // 4`` channels; their outputs
    are concatenated and added to the (optionally projected) input.
    Every normalisation layer is a ``GroupNorm`` with 32 groups.
    """

    def __init__(self, in_channels, out_channels, bias=False):
        super().__init__()
        half, quarter = out_channels // 2, out_channels // 4
        conv_kwargs = {"kernel_size": 3, "stride": 1, "padding": 1, "bias": bias}
        self.conv1 = torch.nn.Conv2d(in_channels, half, **conv_kwargs)
        self.conv2 = torch.nn.Conv2d(half, quarter, **conv_kwargs)
        self.conv3 = torch.nn.Conv2d(quarter, quarter, **conv_kwargs)
        self.bn1 = torch.nn.GroupNorm(32, in_channels)
        self.bn2 = torch.nn.GroupNorm(32, half)
        self.bn3 = torch.nn.GroupNorm(32, quarter)
        self.bn4 = torch.nn.GroupNorm(32, in_channels)
        # bn4 is registered both as an attribute and inside the projection,
        # which keeps the parameter names of existing checkpoints.
        self.downsample = (
            torch.nn.Sequential(
                self.bn4,
                torch.nn.ReLU(True),
                torch.nn.Conv2d(
                    in_channels, out_channels, kernel_size=1, stride=1, bias=False
                ),
            )
            if in_channels != out_channels
            else None
        )

    def forward(self, x):
        """Return ``cat(conv1, conv2, conv3) + shortcut(x)``."""
        feat1 = self.conv1(F.relu(self.bn1(x), True))
        feat2 = self.conv2(F.relu(self.bn2(feat1), True))
        feat3 = self.conv3(F.relu(self.bn3(feat2), True))
        merged = torch.cat((feat1, feat2, feat3), dim=1)
        # Project the input only when the channel counts differ.
        shortcut = x if self.downsample is None else self.downsample(x)
        merged += shortcut
        return merged
class SinCosEncoder(torch.nn.Module):
    """Sinusoidal encoding with frequencies ``2 ** 0 .. 2 ** (n_freq_bands - 1)``.

    The output concatenates, along the last dimension, the sines of the input
    scaled by every frequency followed by the matching cosines, so its last
    dimension is ``2 * n_freq_bands`` times that of the input.
    """

    def __init__(self, n_freq_bands=8):
        super().__init__()
        exponents = torch.linspace(0, n_freq_bands - 1, steps=n_freq_bands)
        # Stored as a plain tensor attribute (not a parameter or buffer).
        self.freq_bands = 2.0 ** exponents

    def forward(self, features):
        """Encode ``features`` along its last dimension."""
        scaled = [features * band for band in self.freq_bands]
        sines = torch.cat([torch.sin(s) for s in scaled], dim=-1)
        cosines = torch.cat([torch.cos(s) for s in scaled], dim=-1)
        return torch.cat([sines, cosines], dim=-1)
class GaussianAttrMLP(torch.nn.Module):
    r"""MLP with affine modulation.

    A shared trunk (``fc_1`` plus ``fc_2 .. fc_<n_shared_layers>``) feeds one
    head per requested attribute. Each head has ``n_layers[k]`` hidden layers
    and an output layer with 1 channel for "opacity" and 3 otherwise.

    Args:
        n_classes: Width of the one-hot class input.
        in_dim: Width of the point feature input.
        z_dim: Width of the style code; ``None`` replaces the modulated
            layers with plain ``torch.nn.Linear`` layers.
        hidden_dim: Width of every hidden layer.
        n_shared_layers: Index of the last trunk layer (``fc_1`` included).
        factors: Mapping attribute name -> output factor; keys must be among
            "xyz", "rgb", "scale" and "opacity". Defaults to an empty mapping.
        n_layers: Mapping attribute name -> number of hidden head layers.
            Defaults to an empty mapping.
    """

    def __init__(
        self,
        n_classes,
        in_dim,
        z_dim,
        hidden_dim,
        n_shared_layers,
        factors=None,
        n_layers=None,
    ):
        super(GaussianAttrMLP, self).__init__()
        # None instead of a shared mutable default dict.
        self.factors = {} if factors is None else factors
        self.n_layers = {} if n_layers is None else n_layers
        self.n_shared_layers = n_shared_layers
        self.act = torch.nn.LeakyReLU(negative_slope=0.2)
        self.fc_m_a = torch.nn.Linear(
            n_classes,
            hidden_dim,
            bias=False,
        )
        self.fc_1 = torch.nn.Linear(
            in_dim,
            hidden_dim,
        )
        for i in range(2, n_shared_layers + 1):
            setattr(self, "fc_%d" % i, self._make_hidden_layer(hidden_dim, z_dim))
        for k in self.factors.keys():
            assert k in ["xyz", "rgb", "scale", "opacity"], "Unknown key: %s" % k
            for i in range(self.n_layers[k]):
                setattr(
                    self,
                    "fc_%d_%s_%d" % (n_shared_layers + 1, k, i),
                    self._make_hidden_layer(hidden_dim, z_dim),
                )
            setattr(
                self,
                "fc_out_%s" % k,
                torch.nn.Linear(
                    hidden_dim,
                    1 if k == "opacity" else 3,
                ),
            )

    @staticmethod
    def _make_hidden_layer(hidden_dim, z_dim):
        """Build one hidden layer: modulated when a style code is used."""
        if z_dim is None:
            return torch.nn.Linear(hidden_dim, hidden_dim)
        return ModLinear(
            hidden_dim,
            hidden_dim,
            z_dim,
            bias=False,
            mod_bias=True,
            output_mode=True,
        )

    def forward(self, pt_feat, onehots, zs):
        """Predict the attribute tensors for every point.

        Args:
            pt_feat: Point features; unpacked as ``(b, n, _)``.
            onehots: One-hot class input, added to the first layer output.
            zs: ``None`` to run every point through the unmodulated path, or
                a mapping whose values are ``{"z": style code, "idx": mask}``;
                each entry is evaluated separately on the points selected by
                its mask.

        Returns:
            Dict attribute name -> tensor.
        """
        b, n, _ = pt_feat.size()
        f = self.fc_1(pt_feat)
        f = f + self.fc_m_a(onehots)
        f = self.act(f)
        if zs is None:
            return self._instance_forward(f)

        output = {
            k: torch.zeros(b, n, 1 if k == "opacity" else 3, device=pt_feat.device)
            for k in self.factors.keys()
        }
        for entry in zs.values():
            z = entry["z"]
            idx = entry["idx"]
            inst_output = self._instance_forward(f[idx].unsqueeze(dim=0), z)
            for k, attr in inst_output.items():
                output[k][idx] = attr
        return output

    def _instance_forward(self, f, z=None):
        """Run trunk and heads on ``f``, modulated by ``z`` when it is given."""
        for i in range(2, self.n_shared_layers + 1):
            fc = getattr(self, "fc_%d" % i)
            f = self.act(fc(f, z) if z is not None else fc(f))
        output = {}
        for k in self.factors.keys():
            _f = f.clone()
            for i in range(self.n_layers[k]):
                _fc = getattr(self, "fc_%d_%s_%d" % (self.n_shared_layers + 1, k, i))
                # Both paths must chain on the head's running activation `_f`.
                # The unmodulated path used to read the trunk output `f`, which
                # bypassed all but the last head layer.
                _f = self.act(_fc(_f, z) if z is not None else _fc(_f))
            fc_out = getattr(self, "fc_out_%s" % k)
            output[k] = fc_out(_f)
        if "xyz" in self.factors:
            output["xyz"] = (torch.sigmoid(output["xyz"]) - 0.5) * self.factors["xyz"]
        if "rgb" in self.factors:
            output["rgb"] = (torch.sigmoid(output["rgb"]) - 0.5) * self.factors["rgb"]
        if "scale" in self.factors:
            output["scale"] = 1 + output["scale"].clamp(-1, 1) * self.factors["scale"]
        if "opacity" in self.factors:
            # Maps the sigmoid into [1 - factor, 1].
            output["opacity"] = torch.sigmoid(output["opacity"]) * self.factors[
                "opacity"
            ] + (1 - self.factors["opacity"])
        return output
class ModLinear(torch.nn.Module):
    r"""Linear layer with affine modulation (Based on StyleGAN2 mod demod).

    Equivalent to affine modulation following linear, but faster when the same
    modulation parameters are shared across multiple inputs.

    Args:
        in_features (int): Number of input features.
        out_features (int): Number of output features.
        style_features (int): Number of style features.
        bias (bool): Apply additive bias before the activation function?
        mod_bias (bool): Whether to modulate bias.
        output_mode (bool): If True, modulate output instead of input.
        weight_gain (float): Initialization gain
        bias_init (float): Initial value of the additive bias.
    """

    def __init__(
        self,
        in_features,
        out_features,
        style_features,
        bias=True,
        mod_bias=True,
        output_mode=False,
        weight_gain=1,
        bias_init=0,
    ):
        super().__init__()
        weight_gain = weight_gain / np.sqrt(in_features)
        self.weight = torch.nn.Parameter(
            torch.randn([out_features, in_features]) * weight_gain
        )
        self.bias = (
            torch.nn.Parameter(torch.full([out_features], np.float32(bias_init)))
            if bias
            else None
        )
        # Scale modulation: one multiplier per input feature, biased to 1.
        self.weight_alpha = torch.nn.Parameter(
            torch.randn([in_features, style_features]) / np.sqrt(style_features)
        )
        self.bias_alpha = torch.nn.Parameter(
            torch.full([in_features], 1, dtype=torch.float)
        )
        self.mod_bias = mod_bias
        self.output_mode = output_mode
        if mod_bias:
            # The shift is added to the output or to the input, depending on the mode.
            shift_dims = out_features if output_mode else in_features
            self.weight_beta = torch.nn.Parameter(
                torch.randn([shift_dims, style_features]) / np.sqrt(style_features)
            )
            self.bias_beta = torch.nn.Parameter(
                torch.full([shift_dims], 0, dtype=torch.float)
            )
        else:
            self.weight_beta = None
            self.bias_beta = None

    @staticmethod
    def _linear_f(x, w, b):
        """Apply ``x @ w.T (+ b)`` over the last dimension of ``x``."""
        lead_shape = x.shape[:-1]
        flat = x.reshape(-1, x.shape[-1])
        w_t = w.to(x.dtype).t()
        if b is None:
            out = flat.matmul(w_t)
        else:
            out = torch.addmm(b.to(x.dtype).unsqueeze(0), flat, w_t)
        return out.reshape(*lead_shape, -1)

    # x: B, ...   , Cin
    # z: B, ...   , Cz
    def forward(self, x, z):
        """Apply the style-modulated linear map to ``x``."""
        in_shape = x.shape
        x = x.reshape(in_shape[0], -1, in_shape[-1])
        z = z.reshape(z.shape[0], -1, z.shape[-1])
        alpha = self._linear_f(z, self.weight_alpha, self.bias_alpha)
        w = self.weight.to(x.dtype)  # [O I]
        w = w.unsqueeze(0) * alpha
        beta = None
        if self.mod_bias:
            beta = self._linear_f(z, self.weight_beta, self.bias_beta)
            if not self.output_mode:
                x = x + beta
        b = None if self.bias is None else self.bias.to(x.dtype)[None, None, :]
        if beta is not None and self.output_mode:
            b = beta if b is None else b + beta
        # [B ? I] @ [B I O] = [B ? O]
        w_t = w.transpose(1, 2)
        x = x.bmm(w_t) if b is None else torch.baddbmm(b, x, w_t)
        return x.reshape(*in_shape[:-1], x.shape[-1])

gaussiancity/inference.py ADDED Viewed

	@@ -0,0 +1,582 @@

+# -*- coding: utf-8 -*-
+#
+# @File:   inference.py
+# @Author: Haozhe Xie
+# @Date:   2024-03-02 16:30:00
+# @Last Modified by: Haozhe Xie
+# @Last Modified at: 2024-10-13 15:17:20
+# @Email:  root@haozhexie.com
+import cv2
+import math
+import numpy as np
+import scipy.spatial.transform
+import torch
+from tqdm import tqdm
# Semantic class name -> integer class ID used in the segmentation maps.
CLASSES = {
    "NULL": 0,
    "ROAD": 1,
    "BLDG_FACADE": 2,
    "GREEN_LANDS": 3,
    "CONSTRUCTION": 4,
    "COAST_ZONES": 5,
    "ZONE": 6,
    "BLDG_ROOF": 7,
}
# Per-class integer scale. It is used as the sampling stride in get_point_map
# and as the per-point scale in _get_point_scales. "NULL" has no entry.
SCALES = {
    "ROAD": 2,
    "BLDG_FACADE": 1,
    "BLDG_ROOF": 1,
    "GREEN_LANDS": 2,
    "CONSTRUCTION": 1,
    "COAST_ZONES": 4,
    "ZONE": 2,
}
CONSTANTS = {
    # Flattened camera matrix; generate_city reshapes it to 3x3.
    "CAM_K": [1528.1469407006614, 0, 480, 0, 1528.1469407006614, 270, 0, 0, 1],
    # Passed to the rasterizer and the ray caster; presumably [width, height] -- TODO confirm.
    "SENSOR_SIZE": [960, 540],
    # Instance IDs >= the first value are treated as buildings throughout this module.
    "BLDG_INST_RANGE": [100, 16384],
    # Side length of the local layout window; half of it is the crop radius.
    "PROJECTION_SIZE": 2048,
    # Multiplier applied to the per-point scales in _get_gs_attrs.
    "POINT_SCALE_FACTOR": 0.5,
    # Classes whose z-scale is forced to 1 in _get_point_scales.
    "SPECIAL_Z_SCALE_CLASSES": [
        CLASSES["ROAD"],
        CLASSES["COAST_ZONES"],
        CLASSES["ZONE"],
    ],
}
def get_instance_seg_map(seg_map):
    """Turn a semantic segmentation map into an instance map for buildings.

    Construction pixels are relabelled as building facades, every 4-connected
    facade region gets its own instance ID, and all other pixels keep their
    class ID.

    NOTE: ``seg_map`` is modified in place (constructions become facades and
    facade pixels are then zeroed); pass a copy if the input is still needed.

    Args:
        seg_map: 2D integer NumPy array of class IDs.

    Returns:
        ``np.int32`` array of the same shape holding class IDs for
        non-building pixels and instance IDs for building pixels.
    """
    # Mapping constructions to buildings
    seg_map[seg_map == CLASSES["CONSTRUCTION"]] = CLASSES["BLDG_FACADE"]
    # Use connected components to get building instances
    _, labels, _, _ = cv2.connectedComponentsWithStats(
        (seg_map == CLASSES["BLDG_FACADE"]).astype(np.uint8), connectivity=4
    )
    # Remove non-building instance masks
    labels[seg_map != CLASSES["BLDG_FACADE"]] = 0
    # Building instance mask
    building_mask = labels != 0
    # Facade instance IDs are even: component label n becomes
    # (n + BLDG_INST_RANGE[0]) * 2. The odd ID right after each facade ID is
    # the one the local-layout code registers for the matching roof.
    labels = (labels + CONSTANTS["BLDG_INST_RANGE"][0]) * 2
    seg_map[seg_map == CLASSES["BLDG_FACADE"]] = 0
    seg_map = seg_map * (1 - building_mask) + labels * building_mask
    # The result is returned as int32, so the IDs must fit.
    assert np.max(labels) < 2147483648
    return seg_map.astype(np.int32)
def get_point_map(seg_map):
    """Return a boolean mask of the pixels at which points are sampled.

    Every class present in ``seg_map`` except "NULL" is sampled on a regular
    grid whose stride is that class's entry in ``SCALES``.
    """
    id_to_name = dict(zip(CLASSES.values(), CLASSES.keys()))
    pts_map = np.zeros(seg_map.shape, dtype=bool)
    for cls_id in np.unique(seg_map):
        cls_name = id_to_name[cls_id]
        if cls_name != "NULL":
            grid = _get_point_map(seg_map.shape, SCALES[cls_name])
            # Keep only the grid positions that fall on this class.
            np.logical_or(pts_map, grid & (seg_map == cls_id), out=pts_map)
    return pts_map
+def _get_point_map(map_size, stride):
+    pts_map = np.zeros(map_size, dtype=bool)
+    ys = np.arange(0, map_size[0], stride)
+    xs = np.arange(0, map_size[1], stride)
+    coords = np.stack(np.meshgrid(ys, xs), axis=-1).reshape(-1, 2)
+    pts_map[coords[:, 0], coords[:, 1]] = True
+    return pts_map
def get_centers(ins_map, td_hf):
    """Compute ``[cx, cy, width, height, max_z]`` for every instance.

    Building instances (ID >= ``BLDG_INST_RANGE[0]``) use the bounding box of
    their external contours and their highest ``td_hf`` value plus one. All
    other instances use the full projection window and the global maximum of
    ``td_hf``.

    Returns:
        Dict instance ID -> ``np.float32`` array of length 5.
    """
    bldg_id_start = CONSTANTS["BLDG_INST_RANGE"][0]
    centers = {}
    for inst in tqdm(np.unique(ins_map), desc="Calculating centers ..."):
        if inst < bldg_id_start:
            min_x, max_x = 0, CONSTANTS["PROJECTION_SIZE"]
            min_y, max_y = 0, CONSTANTS["PROJECTION_SIZE"]
            max_z = np.max(td_hf)
        else:
            inst_mask = ins_map == inst
            contours, _ = cv2.findContours(
                inst_mask.astype(np.uint8),
                cv2.RETR_EXTERNAL,
                cv2.CHAIN_APPROX_SIMPLE,
            )
            outline = np.vstack(contours).reshape(-1, 2)
            xs, ys = outline[:, 0], outline[:, 1]
            min_x, max_x = np.min(xs), np.max(xs)
            min_y, max_y = np.min(ys), np.max(ys)
            max_z = np.max(td_hf[inst_mask]) + 1
        centers[inst] = np.array(
            [
                (min_x + max_x) / 2,
                (min_y + max_y) / 2,
                (max_x - min_x),
                (max_y - min_y),
                max_z,
            ],
            dtype=np.float32,
        )
    return centers
def generate_city(
    fgm, bgm, city_layout, cx, cy, radius, altitude, azimuth, style_lut=None
):
    """Render one view of the city around ``(cx, cy)`` from an orbiting camera.

    Args:
        fgm: Model used for building points (stored under "BLDG").
        bgm: Model used for all other points (stored under "REST").
        city_layout: Mapping of full-city arrays; the local window is cut out
            by ``_get_local_layout``.
        cx, cy: Centre of the local window in city coordinates.
        radius, altitude, azimuth: Orbit camera parameters handed to
            ``_get_orbit_camera_pose`` (azimuth is converted with ``deg2rad``).
        style_lut: Optional instance -> style code table; one is sampled
            with ``_get_style_lut`` when omitted.

    Returns:
        The result of ``_render`` for the visible Gaussians.
    """
    # Imported lazily; the rasterizer is a compiled extension.
    import gaussiancity.extensions.diff_gaussian_rasterization as dgr
    device = torch.device("cuda")
    gr = dgr.GaussianRasterizerWrapper(
        np.array(CONSTANTS["CAM_K"], dtype=np.float32).reshape((3, 3)),
        CONSTANTS["SENSOR_SIZE"],
        flip_lr=True,
        flip_ud=False,
        device=device,
    )
    # Crop the layout to a PROJECTION_SIZE window centred on (cx, cy).
    layout = _get_local_layout(
        city_layout,
        cx,
        cy,
        CONSTANTS["PROJECTION_SIZE"] // 2,
        CONSTANTS["BLDG_INST_RANGE"],
        device,
    )
    bev_pts = _get_bev_points(layout, SCALES, CLASSES)
    bev_pt_classes = _instances_to_classes(
        bev_pts[:, [3]], CONSTANTS["BLDG_INST_RANGE"], CLASSES
    )
    bev_pt_classes_onehot = _get_onehot_seg(bev_pt_classes, len(CLASSES))
    bev_pt_scales = _get_point_scales(
        bev_pt_classes,
        SCALES,
        CLASSES,
        CONSTANTS["SPECIAL_Z_SCALE_CLASSES"],
    )
    bev_pts = torch.cat([bev_pts, bev_pt_scales, bev_pt_classes_onehot], dim=1)
    # print(bev_pts.shape)  # [N, XYZ + Inst + Scale3D + N_CLASSES]
    if style_lut is None:
        style_lut = _get_style_lut(
            layout["CTR"],
            {"BLDG": fgm, "REST": bgm},
            {
                "BLDG": CONSTANTS["BLDG_INST_RANGE"],
                "REST": [0, CONSTANTS["BLDG_INST_RANGE"][0]],
            },
            device,
        )
    cam_look_at, cam_pose = _get_orbit_camera_pose(
        radius, altitude, azimuth, CONSTANTS["PROJECTION_SIZE"] // 2, device
    )
    # NOTE(review): bev_pts[:, :3] is a basic slice, and _get_volume (through
    # _get_localized_pt_cords) subtracts the volume offsets from its `points`
    # argument in place. That presumably shifts the XYZ columns of bev_pts
    # that are read again below -- TODO confirm this is intended.
    vp_idx = _get_visible_points(
        bev_pts[:, :3],
        bev_pt_scales,
        CONSTANTS["CAM_K"],
        CONSTANTS["SENSOR_SIZE"],
        cam_pose[:3],
        cam_look_at,
    )
    # Only the points hit by a camera ray are turned into Gaussians.
    gs_attrs = _get_gs_attrs(
        bev_pts[vp_idx],
        layout["TD_HF"].float(),
        layout["SEG"].float(),
        style_lut,
        layout["CTR"],
        {"BLDG": fgm, "REST": bgm},
        CONSTANTS["POINT_SCALE_FACTOR"],
        CONSTANTS["BLDG_INST_RANGE"],
    )
    return _render(gs_attrs, gr, cam_pose)
def _get_local_layout(city_layout, cx, cy, half_proj_size, bldg_inst_range, device):
    """Crop the city layout around ``(cx, cy)`` and move it to ``device``.

    Returns:
        Dict with the cropped "TD_HF", "BU_HF", "SEG", "INS" and "PTS" maps
        (each with two leading singleton dimensions, "SEG" one-hot encoded)
        plus "CTR", the per-instance centres of the instances in the crop.
    """
    x_min, x_max = cx - half_proj_size, cx + half_proj_size
    y_min, y_max = cy - half_proj_size, cy + half_proj_size
    map_names = ("TD_HF", "BU_HF", "SEG", "INS", "PTS")
    local = {}
    for name, full_map in city_layout.items():
        if name in map_names:
            window = full_map[None, None, y_min:y_max, x_min:x_max]
            local[name] = torch.from_numpy(window).cuda(device)
    local["SEG"] = _get_onehot_seg(local["SEG"], len(CLASSES))

    centers = {}
    for inst in torch.unique(local["INS"]).tolist():
        center = torch.from_numpy(city_layout["CTR"][inst]).cuda(device)
        centers[inst] = center
        if inst >= bldg_inst_range[0]:
            # Buildings: express the centre in window coordinates.
            center[0] -= x_min
            center[1] -= y_min
            centers[inst + 1] = center  # Fix the centers for BLDG_ROOF
        else:
            # Other classes: store the window origin instead of a centre.
            center[0] = x_min
            center[1] = y_min
    local["CTR"] = centers
    return local
+def _get_onehot_seg(seg_map, n_classes):
+    shape = seg_map.shape
+    # shape -> NxCxHxW or NxC
+    # assert shape[1] == 1
+    output_shape = (shape[0], n_classes, *shape[2:])
+    one_hot_masks = torch.zeros(output_shape, device=seg_map.device, dtype=torch.bool)
+    for i in range(n_classes):
+        one_hot_masks[:, [i]] = seg_map == i
+    return one_hot_masks
+def _get_style_lut(centers, models, inst_ranges, device, z_dim=256):
+    lut = {ins: torch.rand(1, z_dim, device=device) for ins in centers.keys()}
+    for k, v in models.items():
+        if v is None:
+            continue
+        if v.module.cfg.Z_DIM is None:
+            for i in range(*inst_ranges[k]):
+                if i in lut:
+                    del lut[i]
+            continue
+        if hasattr(v.module, "z"):
+            zs = v.module.z
+            lut.update(
+                {
+                    ins: zs[np.random.choice(list(zs.keys()))].unsqueeze(0)
+                    for ins in centers.keys()
+                }
+            )
+    return lut
def _get_orbit_camera_pose(radius, altitude, azimuth, half_proj_size, device):
    """Place a camera on a circle around the window centre, looking at it.

    Args:
        radius: Horizontal distance between the camera and the window centre.
        altitude: z coordinate of the camera.
        azimuth: Angle on the circle, in degrees.
        half_proj_size: Used as both the x and y coordinate of the centre.
        device: Device of the returned tensors.

    Returns:
        Tuple ``(look_at, pose)``: the look-at point (3 values) and the
        camera position followed by its quaternion (7 values).
    """
    center = half_proj_size
    angle = np.deg2rad(azimuth)
    cam_pos = np.array(
        [
            center + radius * math.cos(angle),
            center + radius * math.sin(angle),
            altitude,
        ],
        dtype=np.float32,
    )
    cam_look_at = np.array([center, center, 1], dtype=np.float32)
    quat = _get_quat_from_look_at(cam_pos, cam_look_at)
    look_at_tensor = torch.tensor([*cam_look_at], device=device)
    pose_tensor = torch.tensor([*cam_pos, *quat], device=device)
    return look_at_tensor, pose_tensor
+def _get_quat_from_look_at(cam_pos, cam_look_at):
+    fwd_vec = cam_look_at - cam_pos
+    fwd_vec /= np.linalg.norm(fwd_vec)
+    up_vec = np.array([0, 0, 1])
+    right_vec = np.cross(up_vec, fwd_vec)
+    right_vec /= np.linalg.norm(right_vec)
+    up_vec = np.cross(fwd_vec, right_vec)
+    R = np.stack([fwd_vec, right_vec, up_vec], axis=1)
+    return scipy.spatial.transform.Rotation.from_matrix(R).as_quat()
def _get_bev_points(layout, scales, classes):
    """List the occupied voxels of the local layout.

    The layout maps are turned into a volume by the ``voxlib`` extension and
    every non-zero voxel is returned as one row of three indices followed by
    the voxel value.
    """
    import gaussiancity.extensions.voxlib

    ins_map, td_hf = layout["INS"], layout["TD_HF"]
    assert torch.max(ins_map) < 16384
    # torch.nonzero(torch.zeros(2048, 2048, 512).cuda())
    # -> nonzero is not supported for tensors with more than INT_MAX elements
    # torch.nonzero(torch.zeros(2048, 2048, 508).cuda())
    # -> an illegal memory access was encountered
    assert torch.max(td_hf) <= 500
    # One scale per class, in the order of `classes`; 0 for classes without one.
    class_scales = torch.tensor(
        [scales.get(name, 0) for name in classes],
        dtype=torch.int8,
        device=ins_map.device,
    )
    volume = gaussiancity.extensions.voxlib.maps_to_volume(
        ins_map.squeeze().short(),
        td_hf.squeeze().short(),
        layout["BU_HF"].squeeze().short(),
        layout["PTS"].squeeze().bool(),
        class_scales,
    )
    occupied = torch.nonzero(volume, as_tuple=False)
    idx0, idx1, idx2 = occupied.unbind(dim=1)
    values = volume[idx0, idx1, idx2]
    return torch.cat([occupied.short(), values.unsqueeze(dim=1)], dim=1)
+def _instances_to_classes(instances, bldg_inst_range, bldg_classes):
+    bldg_facade_idx = (instances >= bldg_inst_range[0]) & (instances % 2 == 0)
+    bldg_roof_idx = (instances >= bldg_inst_range[0]) & (instances % 2 == 1)
+    classes = instances.clone()
+    classes[bldg_facade_idx] = bldg_classes["BLDG_FACADE"]
+    classes[bldg_roof_idx] = bldg_classes["BLDG_ROOF"]
+    return classes
+def _get_point_scales(pt_classes, scales, classes, special_z_scale_classes=[]):
+    pt_scales = pt_classes.clone()
+    for k, v in scales.items():
+        pt_scales[pt_classes == classes[k]] = v
+    pt_scales_3d = torch.ones_like(pt_scales).repeat(1, 3) * pt_scales
+    # Set the z-scale = 1 for roads, zones, and waters
+    pt_scales_3d[..., 2][
+        torch.isin(
+            pt_classes.squeeze(dim=-1),
+            torch.tensor(
+                list(special_z_scale_classes),
+                device=pt_classes.device,
+            ),
+        )
+    ] = 1
+    return pt_scales_3d
def _get_visible_points(points, scales, K, sensor_size, cam_pos, cam_look_at):
    """Return the indices of the points hit by at least one camera ray.

    NOTE: Each point is assigned a unique ID. The values in the rendered map
    are those IDs, so the unique non-negative values are the visible points.
    """
    # Generate 3D volume
    volume, offsets = _get_volume(points, scales)
    # Ray-voxel intersection, with the camera moved into volume coordinates
    local_cam_pos = cam_pos - offsets
    view_dir = cam_look_at - cam_pos
    vp_map = _get_ray_voxel_intersection(
        K, sensor_size, local_cam_pos, view_dir, volume
    )
    # Manually release the memory to avoid OOM
    del volume
    torch.cuda.empty_cache()
    visible = torch.unique(vp_map)
    # Negative entries mark rays that hit nothing.
    return visible[visible >= 0]
def _get_volume(points, scales):
    """Rasterise the points into a volume of point IDs.

    NOTE: ``points`` is shifted in place by ``_get_localized_pt_cords``.

    Returns:
        Tuple ``(volume, offsets)`` where ``offsets`` holds the minimum x, y
        and z of the original points as an ``int16`` tensor.
    """
    import gaussiancity.extensions.voxlib

    (x_min, x_max), (y_min, y_max), (z_min, z_max) = [
        (torch.min(points[:, axis]).item(), torch.max(points[:, axis]).item())
        for axis in range(3)
    ]
    offsets = torch.tensor(
        [x_min, y_min, z_min], dtype=torch.int16, device=points.device
    )
    # Normalize points coordinates to local coordinate system
    points = _get_localized_pt_cords(points, offsets)
    # Extent of the volume; z gets one extra slice
    w = x_max - x_min + 1
    h = y_max - y_min + 1
    d = z_max - z_min + 2
    # NOTE: The point IDs start from 1 to avoid the conflict with the NULL class.
    n_points = points.shape[0]
    assert n_points < 2147483648
    pt_ids = torch.arange(
        start=1, end=n_points + 1, dtype=torch.int32, device=points.device
    ).unsqueeze(dim=1)
    volume = gaussiancity.extensions.voxlib.points_to_volume(
        points.contiguous(), pt_ids, scales, h, w, d
    )
    return volume, offsets
+def _get_localized_pt_cords(points, offsets):
+    points[:, 0] -= offsets[0]
+    points[:, 1] -= offsets[1]
+    points[:, 2] -= offsets[2] - 1
+    return points
def _get_ray_voxel_intersection(K, sensor_size, cam_origin, viewdir, volume):
    """Cast one ray per pixel through ``volume`` and return the ID it hits.

    The first two components of the camera vectors, the principal point and
    the sensor size are swapped before they are handed to the extension.

    Returns:
        The voxel IDs minus one, squeezed; -1 marks rays that hit nothing.
    """
    import gaussiancity.extensions.voxlib

    N_MAX_SAMPLES = 1
    swapped_axes = [1, 0, 2]
    up_axis = torch.tensor([0, 0, 1], dtype=torch.float32)
    focal = K[0]
    principal_point = [K[5], K[2]]
    image_size = [sensor_size[1], sensor_size[0]]
    voxel_id, _, _ = gaussiancity.extensions.voxlib.ray_voxel_intersection_perspective(
        volume,
        cam_origin[swapped_axes].float(),
        viewdir[swapped_axes].float(),
        up_axis,
        focal,
        principal_point,
        image_size,
        N_MAX_SAMPLES,
    )
    # NOTE: The point ID for NULL class is -1, the rest point IDs are from 0 to N - 1.
    # The ray_voxel_intersection_perspective seems not accepting the negative values.
    return voxel_id.squeeze() - 1
def get_hf_seg_tensor(part_hf, part_seg, layout_cfg, output_device):
    """Stack a normalised height field and a one-hot segmentation map along dim 1.

    Both NumPy inputs get two leading singleton dimensions and are moved to
    ``output_device``; the height field is divided by
    ``CONSTANTS["LAYOUT_MAX_HEIGHT"]``.

    NOTE(review): the ``CONSTANTS`` dict defined in this module has no
    "LAYOUT_MAX_HEIGHT" or "LAYOUT_N_CLASSES" entry, so unless those keys are
    added elsewhere this function raises ``KeyError`` -- confirm whether it
    is still used. ``layout_cfg`` is not read by the body.
    """
    part_hf = torch.from_numpy(part_hf[None, None, ...]).to(output_device)
    part_seg = torch.from_numpy(part_seg[None, None, ...]).to(output_device)
    part_hf = part_hf / CONSTANTS["LAYOUT_MAX_HEIGHT"]
    part_seg = _masks_to_onehots(part_seg[:, 0, :, :], CONSTANTS["LAYOUT_N_CLASSES"])
    return torch.cat([part_hf, part_seg], dim=1)
+def _masks_to_onehots(masks, n_class, ignored_classes=[]):
+    b, h, w = masks.shape
+    n_class_actual = n_class - len(ignored_classes)
+    one_hot_masks = torch.zeros(
+        (b, n_class_actual, h, w), dtype=torch.float32, device=masks.device
+    )
+    n_class_cnt = 0
+    for i in range(n_class):
+        if i not in ignored_classes:
+            one_hot_masks[:, n_class_cnt] = masks == i
+            n_class_cnt += 1
+    return one_hot_masks
def _get_gs_attrs(
    pts,
    proj_hf,
    proj_seg,
    style_lut,
    centers,
    models,
    scale_factor,
    bldg_inst_range,
):
    """Assemble the Gaussian attribute table for the given points.

    Points are split into buildings (instance ID >= ``bldg_inst_range[0]``)
    and the rest; each group is coloured by its own model. The output rows
    are ordered buildings first, then the rest.

    Returns:
        Tensor whose columns are XYZ, opacity (all ones), the 3D scale times
        ``scale_factor``, a rotation of ``(1, 0, 0, 0)`` and the colours.
    """
    n_pts, _ = pts.shape
    # NOTE: 4: XYZ, Instance ID; 3: Scale; N_CLASSES: One-hot
    # print(pts.shape)  # [N, 4 + 3 + N_CLASSES]
    is_bldg = pts[:, 3] >= bldg_inst_range[0]
    groups = {"BLDG": pts[is_bldg], "REST": pts[~is_bldg]}
    input_attrs = {
        name: _get_pt_input_attrs(
            group[:, :4],
            centers,
            style_lut,
            models[name].module.cfg.Z_DIM,
            bldg_inst_range,
        )
        for name, group in groups.items()
    }
    colors = {
        name: _get_gs_colors(group, input_attrs[name], proj_hf, proj_seg, models[name])
        for name, group in groups.items()
    }
    ordered = [groups["BLDG"], groups["REST"]]
    abs_xyz = torch.cat([group[:, :3] for group in ordered], dim=0)
    scales = torch.cat([group[:, 4:7] for group in ordered], dim=0) * scale_factor
    rgb = torch.cat([colors["BLDG"], colors["REST"]], dim=0)
    # Attributes with default values
    opacity = torch.ones((n_pts, 1), device=pts.device)
    rotations = torch.zeros(n_pts, 4, device=pts.device)
    rotations[:, 0] = 1
    return torch.cat((abs_xyz, opacity, scales, rotations, rgb), dim=-1)
+def _get_pt_input_attrs(pts, centers, style_lut, z_dim, bldg_inst_range):
+    n_pts = pts.shape[0]
+    instances = torch.unique(pts[:, -1])
+    rel_xyz = torch.zeros(1, n_pts, 3, dtype=torch.float32, device=pts.device)
+    batch_idx = torch.zeros(1, n_pts, dtype=torch.int32, device=pts.device)
+    zs = {} if z_dim is not None else None
+    for idx, ins in enumerate(instances):
+        ins = ins.item()
+        is_pts = pts[:, -1] == ins
+        cx, cy, w, h, d = centers[ins]
+        if ins >= bldg_inst_range[0]:
+            rel_xyz[:, is_pts, 0] = (pts[is_pts, 0] - cx) / w * 2 if w > 0 else 0
+            rel_xyz[:, is_pts, 1] = (pts[is_pts, 1] - cy) / h * 2 if h > 0 else 0
+        else:
+            # Make the BG contiguous
+            period_x = torch.ceil((pts[is_pts, 0] / w / 2) - 0.5)
+            period_y = torch.ceil((pts[is_pts, 1] / h / 2) - 0.5)
+            rel_xyz[:, is_pts, 0] = (
+                (pts[is_pts, 0] - 2 * period_x * w) * (-1) ** period_x
+            ) / w
+            rel_xyz[:, is_pts, 1] = (
+                (pts[is_pts, 1] - 2 * period_y * h) * (-1) ** period_y
+            ) / h
+        rel_xyz[:, is_pts, 2] = (
+            torch.clip(pts[is_pts, 2] / d * 2 - 1, -1, 1) if d > 0 else 0
+        )
+        batch_idx[:, is_pts] = idx
+        if zs is not None:
+            zs[ins] = {"z": style_lut[ins], "idx": is_pts.unsqueeze(dim=0)}
+    return rel_xyz, batch_idx, zs
+def _get_gs_colors(pts, pt_attrs, proj_hf, proj_seg, model):
+    if pts.shape[0] == 0:
+        return torch.empty(0, 3, dtype=torch.float32, device=pts.device)
+    abs_xyz, onehots = pts[None, :, :3], pts[None, :, 7:]
+    rel_xyz, batch_idx, zs = pt_attrs
+    proj_uv = None
+    if model.module.cfg.ENCODER is not None:
+        proj_uv = get_projection_uv(abs_xyz)
+    with torch.no_grad():
+        # TODO: Optimize the _instance_forward in Generator
+        gs_attrs = model(
+            proj_uv, rel_xyz, batch_idx, onehots.float(), zs, proj_hf, proj_seg
+        )
+    return gs_attrs["rgb"].squeeze(dim=0)
def get_projection_uv(xyz, proj_tlp=None, proj_size=2048):
    """Map the XY columns of ``xyz`` to normalized projection coordinates.

    Args:
        xyz: Tensor of shape (B, N, C) with C >= 2; only ``xyz[..., :2]`` is
            used.
        proj_tlp: Optional tensor of shape (B, 2) subtracted from the XY
            coordinates (broadcast over the point axis) before normalization.
        proj_size: Extent that maps onto the [-1, 1] interval.

    Returns:
        A new (B, N, 2) tensor equal to ``xy / proj_size * 2 - 1``. ``xyz`` is
        never modified.
    """
    n_pts = xyz.size(1)
    proj_uv = xyz[..., :2]
    if proj_tlp is None:
        proj_uv = proj_uv.float()
    else:
        proj_uv = proj_uv - proj_tlp.unsqueeze(dim=1)
        # Integer coordinates cannot hold the normalized result; promote them
        # so this branch accepts the same inputs as the proj_tlp=None branch.
        if not proj_uv.is_floating_point():
            proj_uv = proj_uv.float()
    assert proj_uv.size() == (xyz.size(0), n_pts, 2)
    # Out-of-place arithmetic: the caller's tensor is left untouched without
    # needing an explicit clone.
    # Normalize to [-1, 1]
    return proj_uv / proj_size * 2 - 1
def _render(gs_attrs, rasterizator, cam_pose):
    """Rasterize the Gaussians from one camera pose into a uint8 image.

    Args:
        gs_attrs: Gaussian attribute tensor passed straight to the rasterizer.
        rasterizator: Callable invoked as
            ``rasterizator(gs_attrs, position, quaternion)``.
        cam_pose: Sequence whose first three entries are the camera position
            and whose remaining entries are the orientation quaternion.

    Returns:
        A NumPy uint8 array obtained by permuting the image from channel-first
        to channel-last layout.
    """
    import torchvision.transforms.functional as F

    with torch.no_grad():
        position = cam_pose[:3]
        quaternion = cam_pose[3:]
        rendered = rasterizator(gs_attrs, position, quaternion)

    # Rescale (presumably from [-1, 1] to [0, 1] -- TODO confirm the
    # rasterizer's output range), then boost brightness and contrast by 1.2.
    rendered = rendered.squeeze() / 2 + 0.5
    rendered = F.adjust_contrast(F.adjust_brightness(rendered, 1.2), 1.2)

    channel_last = (rendered * 255).permute(1, 2, 0)
    return channel_last.cpu().numpy().astype(np.uint8)

gaussiancity/pt_v3.py ADDED Viewed

	@@ -0,0 +1,1344 @@

+# -*- coding: utf-8 -*-
+#
+# @File:   pt_v3.py
+# @Author: Xiaoyang Wu <xiaoyang.wu.cs@gmail.com>
+# @Date:   2024-04-01 16:31:36
+# @Last Modified by: Haozhe Xie
+# @Last Modified at: 2024-05-15 22:05:09
+# @Email:  root@haozhexie.com
+# Ref:
+# - https://github.com/Pointcept/PointTransformerV3/blob/main/model.py
+# - https://huggingface.co/spaces/Roll20/pet_score/blame/main/lib/timm/models/layers/drop.py
+import addict
+import collections
+import functools
+import flash_attn
+import math
+import torch
+import spconv.pytorch as spconv
+import torch_scatter
+import typing
@torch.inference_mode()
def offset2bincount(offset):
    """Convert cumulative end offsets into per-segment element counts.

    Args:
        offset: 1-D tensor of cumulative counts, one entry per segment.

    Returns:
        Tensor of the same length holding the differences between consecutive
        offsets, with an implicit 0 before the first entry.
    """
    leading_zero = torch.zeros(1, dtype=torch.long, device=offset.device)
    return torch.diff(offset, prepend=leading_zero)
@torch.inference_mode()
def offset2batch(offset):
    """Expand cumulative end offsets into a per-element segment index.

    Args:
        offset: 1-D tensor of cumulative counts, one entry per segment.

    Returns:
        Long tensor in which index ``i`` is repeated as many times as segment
        ``i`` has elements.
    """
    counts = offset2bincount(offset)
    segment_ids = torch.arange(len(counts), device=offset.device, dtype=torch.long)
    return segment_ids.repeat_interleave(counts)
@torch.inference_mode()
def batch2offset(batch):
    """Convert a per-element segment index into cumulative end offsets.

    Args:
        batch: 1-D tensor of non-negative integer segment indices.

    Returns:
        Long tensor with the running total of elements per segment index.
    """
    counts = batch.bincount()
    return counts.cumsum(dim=0).long()
class KeyLUT:
    """Lookup tables for interleaving (x, y, z) bits into a single key.

    The tables are built once on the CPU and copied lazily to any other device
    the first time that device is requested.
    """

    def __init__(self):
        cpu = torch.device("cpu")
        byte_values = torch.arange(256, dtype=torch.int64)
        nine_bit_keys = torch.arange(512, dtype=torch.int64)
        zeros = torch.zeros(256, dtype=torch.int64)

        # One table per axis: the key contribution of every 8-bit coordinate.
        encode_x = self.xyz2key(byte_values, zeros, zeros, 8)
        encode_y = self.xyz2key(zeros, byte_values, zeros, 8)
        encode_z = self.xyz2key(zeros, zeros, byte_values, 8)
        self._encode = {cpu: (encode_x, encode_y, encode_z)}
        # Inverse tables: the (x, y, z) parts of every 9-bit key.
        self._decode = {cpu: self.key2xyz(nine_bit_keys, 9)}

    @staticmethod
    def _tables_on(cache, device):
        """Return the tables cached for ``device``, copying from CPU if new."""
        if device not in cache:
            cpu_tables = cache[torch.device("cpu")]
            cache[device] = tuple(table.to(device) for table in cpu_tables)
        return cache[device]

    def encode_lut(self, device=torch.device("cpu")):
        """Return the (x, y, z) encoding tables located on ``device``."""
        return self._tables_on(self._encode, device)

    def decode_lut(self, device=torch.device("cpu")):
        """Return the (x, y, z) decoding tables located on ``device``."""
        return self._tables_on(self._decode, device)

    def xyz2key(self, x, y, z, depth):
        """Interleave the low ``depth`` bits of x, y and z into one key.

        Bit ``i`` of x, y and z lands on key bits ``3i + 2``, ``3i + 1`` and
        ``3i`` respectively.
        """
        key = torch.zeros_like(x)
        for bit in range(depth):
            bit_mask = 1 << bit
            x_part = (x & bit_mask) << (2 * bit + 2)
            y_part = (y & bit_mask) << (2 * bit + 1)
            z_part = (z & bit_mask) << (2 * bit + 0)
            key = key | x_part | y_part | z_part
        return key

    def key2xyz(self, key, depth):
        """Split a key back into its x, y and z parts (inverse of xyz2key)."""
        coords = []
        # Axis offsets inside each 3-bit group: x -> 2, y -> 1, z -> 0.
        for axis_offset in (2, 1, 0):
            coord = torch.zeros_like(key)
            for bit in range(depth):
                picked = key & (1 << (3 * bit + axis_offset))
                coord = coord | (picked >> (2 * bit + axis_offset))
            coords.append(coord)
        return tuple(coords)
class Serializator:
    """Turns integer grid coordinates into scalar serialization codes.

    Supported orders are "cord" (plain coordinate weighting), "z" / "z-trans"
    (bit interleaving through ``KeyLUT``) and "hilbert" / "hilbert-trans"
    (Hilbert curve). The "-trans" variants swap the first two coordinate
    columns before encoding.
    """

    def _get_key_lut(self):
        """Return this instance's ``KeyLUT``, building it on first use.

        The table is kept on ``self.key_lut`` so that it (and its per-device
        copies) is reused across calls instead of being rebuilt every time.
        """
        lut = getattr(self, "key_lut", None)
        if lut is None:
            lut = KeyLUT()
            self.key_lut = lut
        return lut

    def encode(self, grid_coord, grid_size=0.01, batch=None, depth=16, order="cord"):
        """Encode ``grid_coord`` with the requested ordering.

        Args:
            grid_coord: (N, 3) tensor of grid coordinates.
            grid_size: Only used by the "cord" order.
            batch: Optional per-point batch index; when given it is shifted
                left by ``depth * 3`` bits and OR-ed into the code.
            depth: Number of bits per axis for the z-order / Hilbert orders.
            order: One of "cord", "z", "z-trans", "hilbert", "hilbert-trans".

        Returns:
            Integer tensor with one code per point.
        """
        assert order in {"cord", "z", "z-trans", "hilbert", "hilbert-trans"}
        if order == "cord":
            code = self.cord_encode(grid_coord, grid_size)
        elif order == "z":
            code = self.z_order_encode(grid_coord, depth=depth)
        elif order == "z-trans":
            code = self.z_order_encode(grid_coord[:, [1, 0, 2]], depth=depth)
        elif order == "hilbert":
            code = self.hilbert_encode(grid_coord, depth=depth)
        elif order == "hilbert-trans":
            code = self.hilbert_encode(grid_coord[:, [1, 0, 2]], depth=depth)
        else:
            # Still reachable when assertions are disabled (python -O).
            raise NotImplementedError
        if batch is not None:
            batch = batch.long()
            code = batch << depth * 3 | code
        return code

    def cord_encode(self, grid_coord: torch.Tensor, grid_size: float):
        """Weight x by ``1 / grid_size**2`` and y by ``1 / grid_size``, add z.

        The sum is computed in floating point and truncated to int64.
        """
        x, y, z = (
            grid_coord[:, 0].long(),
            grid_coord[:, 1].long(),
            grid_coord[:, 2].long(),
        )
        # we block the support to batch, maintain batched code in Point class
        code = x / grid_size**2 + y / grid_size + z
        return code.long()

    def z_order_encode(self, grid_coord: torch.Tensor, depth: int = 16):
        """Return the bit-interleaved (z-order) key of each (x, y, z) row."""
        x, y, z = (
            grid_coord[:, 0].long(),
            grid_coord[:, 1].long(),
            grid_coord[:, 2].long(),
        )
        # we block the support to batch, maintain batched code in Point class
        code = self._xyz2key(x, y, z, b=None, depth=depth)
        return code

    def _xyz2key(
        self,
        x: torch.Tensor,
        y: torch.Tensor,
        z: torch.Tensor,
        b: typing.Optional[typing.Union[torch.Tensor, int]] = None,
        depth: int = 16,
    ):
        r"""Encodes :attr:`x`, :attr:`y`, :attr:`z` coordinates to the shuffled keys
        based on pre-computed look up tables. The speed of this function is much
        faster than the method based on for-loop.
        Args:
        x (torch.Tensor): The x coordinate.
        y (torch.Tensor): The y coordinate.
        z (torch.Tensor): The z coordinate.
        b (torch.Tensor or int): The batch index of the coordinates, and should be
            smaller than 32768. If :attr:`b` is :obj:`torch.Tensor`, the size of
            :attr:`b` must be the same as :attr:`x`, :attr:`y`, and :attr:`z`.
        depth (int): The depth of the shuffled key, and must be smaller than 17 (< 17).
        """
        # Works even when this method is reached without going through encode().
        EX, EY, EZ = self._get_key_lut().encode_lut(x.device)
        x, y, z = x.long(), y.long(), z.long()
        # The tables cover 8 bits per axis: encode the low byte first ...
        mask = 255 if depth > 8 else (1 << depth) - 1
        key = EX[x & mask] | EY[y & mask] | EZ[z & mask]
        if depth > 8:
            # ... then the remaining high bits, placed above the low 24 key bits.
            mask = (1 << (depth - 8)) - 1
            key16 = EX[(x >> 8) & mask] | EY[(y >> 8) & mask] | EZ[(z >> 8) & mask]
            key = key16 << 24 | key
        if b is not None:
            # A plain int is a documented input and has no .long().
            if isinstance(b, torch.Tensor):
                b = b.long()
            key = b << 48 | key
        return key

    def hilbert_encode(self, grid_coord: torch.Tensor, depth: int = 16):
        """Return the 3-D Hilbert index of each row using ``depth`` bits per axis."""
        return self._hilbert_encode(grid_coord, num_dims=3, num_bits=depth)

    def _hilbert_encode(self, locs, num_dims, num_bits):
        """Encode an array of locations in a hypercube into a Hilbert integer.
        This is a vectorized-ish version of the Hilbert curve implementation by John
        Skilling as described in:
        Skilling, J. (2004, April). Programming the Hilbert curve. In AIP Conference
        Proceedings (Vol. 707, No. 1, pp. 381-387). American Institute of Physics.
        Params:
        -------
        locs - A tensor of locations in a hypercube of num_dims dimensions, in
                which each dimension runs from 0 to 2**num_bits-1.  The last
                dimension must have size num_dims.
        num_dims - The dimensionality of the hypercube. Integer.
        num_bits - The number of bits for each dimension. Integer.
        Returns:
        --------
        A 1-D int64 tensor with one Hilbert integer per location (leading
        dimensions of locs are flattened). A single location yields a tensor of
        shape (1,), not a 0-dim scalar.
        """
        # Keep around the original shape for later.
        orig_shape = locs.shape
        bitpack_mask = 1 << torch.arange(0, 8).to(locs.device)
        bitpack_mask_rev = bitpack_mask.flip(-1)
        if orig_shape[-1] != num_dims:
            raise ValueError(
                """
            The shape of locs was surprising in that the last dimension was of size
            %d, but num_dims=%d.  These need to be equal.
            """
                % (orig_shape[-1], num_dims)
            )
        if num_dims * num_bits > 63:
            raise ValueError(
                """
            num_dims=%d and num_bits=%d for %d bits total, which can't be encoded
            into a int64.  Are you sure you need that many points on your Hilbert
            curve?
            """
                % (num_dims, num_bits, num_dims * num_bits)
            )
        # Reinterpret each 64-bit location as 8 bytes and reverse the byte
        # order.  Preserve the association by dimension.
        locs_uint8 = locs.long().view(torch.uint8).reshape((-1, num_dims, 8)).flip(-1)
        # Now turn these into bits and truncate to num_bits.
        gray = (
            locs_uint8.unsqueeze(-1)
            .bitwise_and(bitpack_mask_rev)
            .ne(0)
            .byte()
            .flatten(-2, -1)[..., -num_bits:]
        )
        # Run the decoding process the other way.
        # Iterate forwards through the bits.
        for bit in range(0, num_bits):
            # Iterate forwards through the dimensions.
            for dim in range(0, num_dims):
                # Identify which ones have this bit active.
                mask = gray[:, dim, bit]
                # Where this bit is on, invert the 0 dimension for lower bits.
                gray[:, 0, bit + 1 :] = torch.logical_xor(
                    gray[:, 0, bit + 1 :], mask[:, None]
                )
                # Where the bit is off, exchange the lower bits with the 0 dimension.
                to_flip = torch.logical_and(
                    torch.logical_not(mask[:, None]).repeat(1, gray.shape[2] - bit - 1),
                    torch.logical_xor(gray[:, 0, bit + 1 :], gray[:, dim, bit + 1 :]),
                )
                gray[:, dim, bit + 1 :] = torch.logical_xor(
                    gray[:, dim, bit + 1 :], to_flip
                )
                gray[:, 0, bit + 1 :] = torch.logical_xor(
                    gray[:, 0, bit + 1 :], to_flip
                )
        # Now flatten out.
        # Fix: shape '[-1, 0]' is invalid for input of size 192
        gray = gray.swapaxes(1, 2).reshape((gray.size(0), -1))
        # Convert Gray back to binary.
        hh_bin = self._gray2binary(gray)
        # Pad back out to 64 bits.
        extra_dims = 64 - gray.size(1)
        padded = torch.nn.functional.pad(hh_bin, (extra_dims, 0), "constant", 0)
        # Convert binary values into uint8s.  The result stays (N, 8): no
        # squeeze here, otherwise a single location would lose its row axis.
        hh_uint8 = (
            (padded.flip(-1).reshape((-1, 8, 8)) * bitpack_mask)
            .sum(2)
            .type(torch.uint8)
        )
        # Reinterpret every 8 bytes as one int64 -> (N, 1), then flatten to (N,).
        return hh_uint8.view(torch.int64).reshape(-1)

    def _gray2binary(self, gray, axis=-1):
        """Convert an array of Gray codes back into binary values.
        Parameters:
        -----------
        gray: A tensor of gray codes.
        axis: The axis along which to perform Gray decoding. Default=-1.
        Returns:
        --------
        Returns a tensor of binary values.
        """
        # Loop the log2(bits) number of times necessary, with shift and xor.
        # NOTE: `shift` is a one-element integer tensor, not a Python int.
        shift = 2 ** (torch.Tensor([gray.shape[axis]]).log2().ceil().int() - 1)
        while shift > 0:
            gray = torch.logical_xor(gray, self._right_shift(gray, shift))
            shift = torch.div(shift, 2, rounding_mode="floor")
        return gray

    def _right_shift(self, binary, k=1, axis=-1):
        """Right shift an array of binary values.
        Parameters:
        -----------
        binary: A tensor of binary values.
        k: The number of bits to shift. Default 1.
        axis: The axis along which to shift.  Default -1.
        Returns:
        --------
        Returns a tensor with zero prepended and the ends truncated, along
        whatever axis was specified."""
        # If we're shifting the whole thing, just return zeros.
        if binary.shape[axis] <= k:
            return torch.zeros_like(binary)
        # Determine the slicing pattern to eliminate just the last k entries.
        slicing = [slice(None)] * len(binary.shape)
        slicing[axis] = slice(None, -k)
        # NOTE(review): the padding is always applied to the last dimension,
        # which matches the slicing only for axis=-1 (the only value used by
        # the visible callers).
        shifted = torch.nn.functional.pad(
            binary[tuple(slicing)], (k, 0), mode="constant", value=0
        )
        return shifted
class PointModule(torch.nn.Module):
    r"""Marker base class for modules that consume a whole ``Point``.

    ``PointSequential`` hands the full ``Point`` object (rather than just its
    features) to every module deriving from this class.
    """

    def __init__(self, *args, **kwargs):
        super(PointModule, self).__init__(*args, **kwargs)
class Point(addict.Dict):
    """
    Point Structure of Pointcept
    A Point (point cloud) in Pointcept is a dictionary that contains various properties of
    a batched point cloud. The properties with the following names have a specific definition
    as follows:
    - "coord": original coordinate of point cloud;
    - "grid_coord": grid coordinate for specific grid size (related to GridSampling);
    Point also supports the following optional attributes:
    - "offset": if it does not exist, derived from "batch" when that is given;
    - "batch": if it does not exist, derived from "offset" when that is given;
    - "feat": feature of point cloud, default input of model;
    - "grid_size": Grid size of point cloud (related to GridSampling);
    (related to Serialization)
    - "serialized_depth": depth of serialization, 2 ** depth * grid_size describes the maximum of point cloud range;
    - "serialized_code": a list of serialization codes;
    - "serialized_order": a list of serialization orders determined by code;
    - "serialized_inverse": a list of inverse mappings determined by code;
    (related to Sparsify: SpConv)
    - "sparse_shape": Sparse shape for Sparse Conv Tensor;
    - "sparse_conv_feat": SparseConvTensor initialized with information provided by Point;
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.serializator = Serializator()
        # If exactly one of "offset" / "batch" exists, generate the other from it
        if "batch" not in self.keys() and "offset" in self.keys():
            self["batch"] = offset2batch(self.offset)
        elif "offset" not in self.keys() and "batch" in self.keys():
            self["offset"] = batch2offset(self.batch)

    def _ensure_grid_coord(self):
        """Derive "grid_coord" from "coord" and "grid_size" when it is missing."""
        if "grid_coord" in self.keys():
            return
        # if you don't want to operate GridSampling in data augmentation,
        # please add the following augmentation into your pipeline:
        # dict(type="Copy", keys_dict={"grid_size": 0.01}),
        # (adjust `grid_size` to what you want)
        assert {"grid_size", "coord"}.issubset(self.keys())
        self["grid_coord"] = torch.div(
            self.coord - self.coord.min(0)[0], self.grid_size, rounding_mode="trunc"
        ).int()

    def serialization(self, order="z", depth=None, shuffle_orders=False):
        """
        Point Cloud Serialization
        Relies on ["grid_coord" or "coord" + "grid_size", "batch", "feat"].

        order: a single order name or an iterable of order names accepted by
            ``Serializator.encode``; one code row is produced per name.
        depth: bits per axis; derived from the largest grid coordinate if None.
        shuffle_orders: randomly permute the rows of the produced codes/orders.

        Stores "serialized_depth", "serialized_code", "serialized_order" and
        "serialized_inverse" on the Point.
        """
        assert "batch" in self.keys()
        self._ensure_grid_coord()
        # A bare string would otherwise be iterated character by character
        # ("hilbert" -> "h", "i", ...); treat it as a single order name.
        if isinstance(order, str):
            order = [order]
        if depth is None:
            # Adaptive measure the depth of serialization cube (length = 2 ^ depth)
            depth = int(self.grid_coord.max()).bit_length()
        self["serialized_depth"] = depth
        # Maximum bit length for serialization code is 63 (int64)
        assert depth * 3 + len(self.offset).bit_length() <= 63
        # Here we follow OCNN and set the depth limitation to 16 (48bit) for the point position.
        # Although depth is limited to at most 16, we can encode a 655.36^3 (2^16 * 0.01) meter^3
        # cube with a grid size of 0.01 meter. We consider it is enough for the current stage.
        # We can unlock the limitation by optimizing the z-order encoding function if necessary.
        assert depth <= 16
        # The serialization codes are arranged as following structures:
        # [Order1 ([n]),
        #  Order2 ([n]),
        #   ...
        #  OrderN ([n])] (k, n)
        code = [
            self.serializator.encode(
                self.grid_coord, self.grid_size, self.batch, depth, order=order_
            )
            for order_ in order
        ]
        code = torch.stack(code)
        order = torch.argsort(code)
        # inverse[k, order[k, j]] = j: position of every point in ordering k.
        inverse = torch.zeros_like(order).scatter_(
            dim=1,
            index=order,
            src=torch.arange(0, code.shape[1], device=order.device).repeat(
                code.shape[0], 1
            ),
        )
        if shuffle_orders:
            perm = torch.randperm(code.shape[0])
            code = code[perm]
            order = order[perm]
            inverse = inverse[perm]
        self["serialized_code"] = code
        self["serialized_order"] = order
        self["serialized_inverse"] = inverse

    def sparsify(self, pad=96):
        """
        Point Cloud Sparsification
        Point cloud is sparse, here we use "sparsify" to specifically refer to
        preparing "spconv.SparseConvTensor" for SpConv.
        Relies on ["grid_coord" or "coord" + "grid_size", "batch", "feat"].
        pad: margin added to the largest grid coordinate per axis when the
            sparse shape has to be derived.

        Stores "sparse_shape" and "sparse_conv_feat" on the Point.
        """
        assert {"feat", "batch"}.issubset(self.keys())
        self._ensure_grid_coord()
        if "sparse_shape" in self.keys():
            sparse_shape = self.sparse_shape
        else:
            sparse_shape = torch.add(
                torch.max(self.grid_coord, dim=0).values, pad
            ).tolist()
        sparse_conv_feat = spconv.SparseConvTensor(
            features=self.feat,
            # Index rows are (batch index, grid x, grid y, grid z).
            indices=torch.cat(
                [self.batch.unsqueeze(-1).int(), self.grid_coord.int()], dim=1
            ).contiguous(),
            spatial_shape=sparse_shape,
            # The last batch index + 1 is used as the batch size.
            batch_size=self.batch[-1].tolist() + 1,
        )
        self["sparse_shape"] = sparse_shape
        self["sparse_conv_feat"] = sparse_conv_feat
class PointSequential(PointModule):
    r"""Sequential container that understands ``Point`` and spconv inputs.

    Modules run in the order they were registered. They can be supplied
    positionally (after ``name``), as a single ``OrderedDict``, or as keyword
    arguments.
    """

    def __init__(self, name="", *args, **kwargs):
        super().__init__()
        self.name = name
        if len(args) == 1 and isinstance(args[0], collections.OrderedDict):
            positional = list(args[0].items())
        else:
            positional = [(str(idx), module) for idx, module in enumerate(args)]
        for key, module in positional:
            self.add_module(key, module)
        for key, module in kwargs.items():
            if key in self._modules:
                raise ValueError("name exists.")
            self.add_module(key, module)

    def __getitem__(self, idx):
        count = len(self)
        if idx < -count or idx >= count:
            raise IndexError("index {} is out of range".format(idx))
        if idx < 0:
            idx += count
        return list(self._modules.values())[idx]

    def __len__(self):
        return len(self._modules)

    def add(self, module, name=None):
        """Append ``module``; without ``name`` it is keyed by its position."""
        if name is None:
            name = str(len(self._modules))
            if name in self._modules:
                raise KeyError("name exists")
        self.add_module(name, module)

    def forward(self, x):
        """Feed ``x`` through every registered module in order.

        How a module is applied depends on the module and on the type of ``x``:
        Point modules receive ``x`` itself, spconv modules receive the sparse
        tensor, and plain PyTorch modules receive the feature tensor.
        """
        for module in self._modules.values():
            # Point module: operates on the whole input.
            if isinstance(module, PointModule):
                x = module(x)
                continue

            # Spconv module: runs on the sparse tensor, features are synced back.
            if spconv.modules.is_spconv_module(module):
                if isinstance(x, Point):
                    x.sparse_conv_feat = module(x.sparse_conv_feat)
                    x.feat = x.sparse_conv_feat.features
                else:
                    x = module(x)
                continue

            if isinstance(x, Point):
                if isinstance(module, torch.nn.BatchNorm1d):
                    # Fix: Expected more than 1 value per channel when training
                    # -> BatchNorm1d is skipped when there is a single row.
                    if x.feat.size(0) != 1:
                        x.feat = module(x.feat)
                else:
                    # PyTorch module on a Point: update the features and keep
                    # the sparse tensor (if any) in sync.
                    x.feat = module(x.feat)
                    if "sparse_conv_feat" in x.keys():
                        x.sparse_conv_feat = x.sparse_conv_feat.replace_feature(x.feat)
                continue

            if isinstance(x, spconv.SparseConvTensor):
                # A sparse tensor without any index row is passed through.
                if x.indices.shape[0] != 0:
                    x = x.replace_feature(module(x.features))
                continue

            x = module(x)
        return x
class PDNorm(PointModule):
    """Normalization layer selected by the ``condition`` stored on the Point.

    With ``decouple=True`` one norm layer is created per entry of
    ``conditions`` and the one matching ``point.condition`` is applied;
    otherwise ``norm_layer`` itself is stored and applied. With
    ``adaptive=True`` the normalized features are additionally scaled and
    shifted using values predicted from ``point.context``.
    """

    def __init__(
        self,
        num_features,
        norm_layer,
        context_channels=256,
        conditions=("ScanNet", "S3DIS", "Structured3D"),
        decouple=True,
        adaptive=False,
    ):
        super().__init__()
        self.conditions = conditions
        self.decouple = decouple
        self.adaptive = adaptive

        if not self.decouple:
            # Stored as given (not called with num_features).
            self.norm = norm_layer
        else:
            per_condition = [norm_layer(num_features) for _ in conditions]
            self.norm = torch.nn.ModuleList(per_condition)

        if self.adaptive:
            # Predicts 2 * num_features values: one half shift, one half scale.
            self.modulation = torch.nn.Sequential(
                torch.nn.SiLU(),
                torch.nn.Linear(context_channels, 2 * num_features, bias=True),
            )

    def forward(self, point):
        assert {"feat", "condition"}.issubset(point.keys())
        # A non-string condition is indexed and its first entry is used.
        raw_condition = point.condition
        condition = raw_condition if isinstance(raw_condition, str) else raw_condition[0]

        if not self.decouple:
            norm = self.norm
        else:
            assert condition in self.conditions
            norm = self.norm[self.conditions.index(condition)]
        point.feat = norm(point.feat)

        if self.adaptive:
            assert "context" in point.keys()
            shift, scale = self.modulation(point.context).chunk(2, dim=1)
            point.feat = point.feat * (1.0 + scale) + shift
        return point
class RPE(torch.nn.Module):
    """Learned relative-position bias looked up per axis and per head.

    The table holds ``rpe_num = 2 * pos_bnd + 1`` rows for each of the three
    axes; offsets are clamped to ``[-pos_bnd, pos_bnd]`` before the lookup.
    """

    def __init__(self, patch_size, num_heads):
        super().__init__()
        self.patch_size = patch_size
        self.num_heads = num_heads
        self.pos_bnd = int((4 * patch_size) ** (1 / 3) * 2)
        self.rpe_num = 2 * self.pos_bnd + 1
        table = torch.zeros(3 * self.rpe_num, num_heads)
        self.rpe_table = torch.nn.Parameter(table)
        torch.nn.init.trunc_normal_(self.rpe_table, std=0.02)

    def forward(self, coord):
        """Return the bias for integer offsets ``coord`` of shape (N, K, K, 3).

        The result has shape (N, H, K, K): the three per-axis table entries
        are summed for every offset.
        """
        bound = self.pos_bnd
        # Each axis owns its own block of rpe_num rows in the table.
        axis_stride = torch.arange(3, device=coord.device) * self.rpe_num
        # clamp into bnd, shift to a non-negative index, add the x/y/z stride
        idx = coord.clamp(-bound, bound) + bound + axis_stride
        gathered = torch.index_select(self.rpe_table, 0, idx.reshape(-1))
        per_axis = gathered.view(*idx.shape, -1)
        summed = per_axis.sum(dim=3)
        # (N, K, K, H) -> (N, H, K, K)
        return summed.permute(0, 3, 1, 2)
class SerializedAttention(PointModule):
    """Multi-head self-attention over fixed-size patches of serialized points.

    Points are reordered with ``point.serialized_order[order_index]``, padded
    so every batch item spans whole patches, attended within each patch and
    finally scattered back to the original point order.

    Note that with ``enable_flash=True`` the constructor requires
    ``enable_rpe``, ``upcast_attention`` and ``upcast_softmax`` to be False.
    """

    def __init__(
        self,
        channels,
        num_heads,
        patch_size,
        qkv_bias=True,
        qk_scale=None,
        attn_drop=0.0,
        proj_drop=0.0,
        order_index=0,
        enable_rpe=False,
        enable_flash=True,
        upcast_attention=True,
        upcast_softmax=True,
    ):
        super().__init__()
        assert channels % num_heads == 0
        head_dim = channels // num_heads
        self.channels = channels
        self.num_heads = num_heads
        self.scale = qk_scale or head_dim**-0.5
        self.order_index = order_index
        self.upcast_attention = upcast_attention
        self.upcast_softmax = upcast_softmax
        self.enable_rpe = enable_rpe
        self.enable_flash = enable_flash

        if not enable_flash:
            # when disable flash attention, we still don't want to use mask
            # consequently, patch size will auto set to the
            # min number of patch_size_max and number of points
            self.patch_size_max = patch_size
            self.patch_size = 0
            self.attn_drop = torch.nn.Dropout(attn_drop)
        else:
            assert (
                enable_rpe is False
            ), "Set enable_rpe to False when enable Flash Attention"
            assert (
                upcast_attention is False
            ), "Set upcast_attention to False when enable Flash Attention"
            assert (
                upcast_softmax is False
            ), "Set upcast_softmax to False when enable Flash Attention"
            assert flash_attn is not None, "Make sure flash_attn is installed."
            self.patch_size = patch_size
            # Kept as a plain probability; it is handed to flash_attn directly.
            self.attn_drop = attn_drop

        self.qkv = torch.nn.Linear(channels, channels * 3, bias=qkv_bias)
        self.proj = torch.nn.Linear(channels, channels)
        self.proj_drop = torch.nn.Dropout(proj_drop)
        self.softmax = torch.nn.Softmax(dim=-1)
        self.rpe = RPE(patch_size, num_heads) if self.enable_rpe else None

    @torch.no_grad()
    def get_rel_pos(self, point, order):
        """Return pairwise grid offsets inside every patch, cached on ``point``."""
        patch = self.patch_size
        cache_key = f"rel_pos_{self.order_index}"
        if cache_key not in point.keys():
            patched = point.grid_coord[order].reshape(-1, patch, 3)
            # (N', K, 1, 3) - (N', 1, K, 3) -> (N', K, K, 3)
            point[cache_key] = patched.unsqueeze(2) - patched.unsqueeze(1)
        return point[cache_key]

    @torch.no_grad()
    def get_padding_and_inverse(self, point):
        """Compute (and cache on ``point``) the padding index maps.

        Returns ``(pad, unpad, cu_seqlens)``: ``pad`` maps padded positions to
        source positions, ``unpad`` maps source positions to padded positions,
        and ``cu_seqlens`` lists the patch start positions in the padded
        sequence followed by its total length.
        """
        pad_key = "pad"
        unpad_key = "unpad"
        cu_seqlens_key = "cu_seqlens_key"
        required = (pad_key, unpad_key, cu_seqlens_key)
        if not all(key in point.keys() for key in required):
            patch = self.patch_size
            offset = point.offset
            device = offset.device
            counts = offset2bincount(offset)
            # Round every count up to a whole number of patches ...
            counts_rounded = (
                torch.div(counts + patch - 1, patch, rounding_mode="trunc") * patch
            )
            # ... but only pad point when num of points larger than patch_size
            needs_pad = counts > patch
            counts_padded = ~needs_pad * counts + needs_pad * counts_rounded
            starts = torch.nn.functional.pad(offset, (1, 0))
            starts_padded = torch.nn.functional.pad(
                torch.cumsum(counts_padded, dim=0), (1, 0)
            )
            pad = torch.arange(starts_padded[-1], device=device)
            unpad = torch.arange(starts[-1], device=device)
            cu_seqlens = []
            for i in range(len(offset)):
                seg_begin, seg_end = starts_padded[i], starts_padded[i + 1]
                shift = seg_begin - starts[i]
                unpad[starts[i] : starts[i + 1]] += shift
                if counts[i] != counts_padded[i]:
                    # Fill the padded tail of the last patch with the entries
                    # located one patch earlier.
                    tail_begin = seg_end - patch + (counts[i] % patch)
                    pad[tail_begin:seg_end] = pad[tail_begin - patch : seg_end - patch]
                pad[seg_begin:seg_end] -= shift
                cu_seqlens.append(
                    torch.arange(
                        seg_begin,
                        seg_end,
                        step=patch,
                        dtype=torch.int32,
                        device=device,
                    )
                )
            point[pad_key] = pad
            point[unpad_key] = unpad
            point[cu_seqlens_key] = torch.nn.functional.pad(
                torch.concat(cu_seqlens), (0, 1), value=starts_padded[-1]
            )
        return point[pad_key], point[unpad_key], point[cu_seqlens_key]

    def forward(self, point):
        if not self.enable_flash:
            # Without flash attention no mask is used, so the patch size is
            # capped by the smallest batch item.
            smallest = offset2bincount(point.offset).min().tolist()
            self.patch_size = min(smallest, self.patch_size_max)

        num_heads = self.num_heads
        patch = self.patch_size
        channels = self.channels
        head_dim = channels // num_heads

        pad, unpad, cu_seqlens = self.get_padding_and_inverse(point)
        order = point.serialized_order[self.order_index][pad]
        inverse = unpad[point.serialized_inverse[self.order_index]]

        # padding and reshape feat and batch for serialized point patch
        qkv = self.qkv(point.feat)[order]

        if self.enable_flash:
            feat = flash_attn.flash_attn_varlen_qkvpacked_func(
                qkv.half().reshape(-1, 3, num_heads, head_dim),
                cu_seqlens,
                max_seqlen=self.patch_size,
                dropout_p=self.attn_drop if self.training else 0,
                softmax_scale=self.scale,
            ).reshape(-1, channels)
            feat = feat.to(qkv.dtype)
        else:
            # encode and reshape qkv: (N', K, 3, H, C') => (3, N', H, K, C')
            q, k, v = (
                qkv.reshape(-1, patch, 3, num_heads, head_dim)
                .permute(2, 0, 3, 1, 4)
                .unbind(dim=0)
            )
            # attn
            if self.upcast_attention:
                q = q.float()
                k = k.float()
            attn = (q * self.scale) @ k.transpose(-2, -1)  # (N', H, K, K)
            if self.enable_rpe:
                attn = attn + self.rpe(self.get_rel_pos(point, order))
            if self.upcast_softmax:
                attn = attn.float()
            attn = self.softmax(attn)
            attn = self.attn_drop(attn).to(qkv.dtype)
            feat = (attn @ v).transpose(1, 2).reshape(-1, channels)

        # Drop the padding and restore the original point order.
        feat = feat[inverse]

        # ffn
        point.feat = self.proj_drop(self.proj(feat))
        return point
+class MLP(torch.nn.Module):
+    def __init__(
+        self,
+        in_channels,
+        hidden_channels=None,
+        out_channels=None,
+        act_layer=torch.nn.GELU,
+        drop=0.0,
+    ):
+        super().__init__()
+        out_channels = out_channels or in_channels
+        hidden_channels = hidden_channels or in_channels
+        self.fc1 = torch.nn.Linear(in_channels, hidden_channels)
+        self.act = act_layer()
+        self.fc2 = torch.nn.Linear(hidden_channels, out_channels)
+        # self.drop = torch.nn.Dropout(drop)
+    def forward(self, x):
+        x = self.fc1(x)
+        x = self.act(x)
+        # x = self.drop(x)
+        x = self.fc2(x)
+        # x = self.drop(x)
+        return x
+class Block(PointModule):
+    def __init__(
+        self,
+        channels,
+        num_heads,
+        patch_size=48,
+        mlp_ratio=4.0,
+        qkv_bias=True,
+        qk_scale=None,
+        attn_drop=0.0,
+        proj_drop=0.0,
+        drop_path=0.0,
+        norm_layer=torch.nn.LayerNorm,
+        act_layer=torch.nn.GELU,
+        pre_norm=True,
+        order_index=0,
+        cpe_indice_key=None,
+        enable_rpe=False,
+        enable_flash=True,
+        upcast_attention=True,
+        upcast_softmax=True,
+    ):
+        super().__init__()
+        self.channels = channels
+        self.pre_norm = pre_norm
+        self.cpe = PointSequential(
+            spconv.SubMConv3d(
+                channels,
+                channels,
+                kernel_size=3,
+                bias=True,
+                indice_key=cpe_indice_key,
+            ),
+            torch.nn.Linear(channels, channels),
+            norm_layer(channels),
+        )
+        self.norm1 = PointSequential(norm_layer(channels))
+        self.attn = SerializedAttention(
+            channels=channels,
+            patch_size=patch_size,
+            num_heads=num_heads,
+            qkv_bias=qkv_bias,
+            qk_scale=qk_scale,
+            attn_drop=attn_drop,
+            proj_drop=proj_drop,
+            order_index=order_index,
+            enable_rpe=enable_rpe,
+            enable_flash=enable_flash,
+            upcast_attention=upcast_attention,
+            upcast_softmax=upcast_softmax,
+        )
+        self.norm2 = PointSequential(norm_layer(channels))
+        self.mlp = PointSequential(
+            MLP(
+                in_channels=channels,
+                hidden_channels=int(channels * mlp_ratio),
+                out_channels=channels,
+                act_layer=act_layer,
+                drop=proj_drop,
+            )
+        )
+        self.drop_path = PointSequential(
+            DropPath(drop_path) if drop_path > 0.0 else torch.nn.Identity()
+        )
+    def forward(self, point: Point):
+        shortcut = point.feat
+        point = self.cpe(point)
+        point.feat = shortcut + point.feat
+        shortcut = point.feat
+        if self.pre_norm:
+            point = self.norm1(point)
+        point = self.drop_path(self.attn(point))
+        point.feat = shortcut + point.feat
+        if not self.pre_norm:
+            point = self.norm1(point)
+        shortcut = point.feat
+        if self.pre_norm:
+            point = self.norm2(point)
+        point = self.drop_path(self.mlp(point))
+        point.feat = shortcut + point.feat
+        if not self.pre_norm:
+            point = self.norm2(point)
+        point.sparse_conv_feat = point.sparse_conv_feat.replace_feature(point.feat)
+        return point
+class DropPath(torch.nn.Module):
+    """Drop paths (Stochastic Depth) per sample  (when applied in main path of residual blocks)."""
+    def __init__(self, drop_prob=None, scale_by_keep=True):
+        super(DropPath, self).__init__()
+        self.drop_prob = drop_prob
+        self.scale_by_keep = scale_by_keep
+    def _drop_path(
+        self,
+        x,
+        drop_prob: float = 0.0,
+        training: bool = False,
+        scale_by_keep: bool = True,
+    ):
+        """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
+        This is the same as the DropConnect impl I created for EfficientNet, etc networks, however,
+        the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
+        See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for
+        changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use
+        'survival rate' as the argument.
+        """
+        if drop_prob == 0.0 or not training:
+            return x
+        keep_prob = 1 - drop_prob
+        shape = (x.shape[0],) + (1,) * (
+            x.ndim - 1
+        )  # work with diff dim tensors, not just 2D ConvNets
+        random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
+        if keep_prob > 0.0 and scale_by_keep:
+            random_tensor.div_(keep_prob)
+        return x * random_tensor
+    def forward(self, x):
+        return self._drop_path(x, self.drop_prob, self.training, self.scale_by_keep)
+class SerializedPooling(PointModule):
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        stride=2,
+        norm_layer=None,
+        act_layer=None,
+        reduce="max",
+        shuffle_orders=True,
+        traceable=True,  # record parent and cluster
+    ):
+        super().__init__()
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        assert stride == 2 ** (math.ceil(stride) - 1).bit_length()  # 2, 4, 8
+        # TODO: add support to grid pool (any stride)
+        self.stride = stride
+        assert reduce in ["sum", "mean", "min", "max"]
+        self.reduce = reduce
+        self.shuffle_orders = shuffle_orders
+        self.traceable = traceable
+        self.proj = torch.nn.Linear(in_channels, out_channels)
+        if norm_layer is not None:
+            self.norm = PointSequential(norm_layer(out_channels))
+        if act_layer is not None:
+            self.act = PointSequential(act_layer())
+    def forward(self, point: Point):
+        pooling_depth = (math.ceil(self.stride) - 1).bit_length()
+        if pooling_depth > point.serialized_depth:
+            pooling_depth = 0
+        assert {
+            "serialized_code",
+            "serialized_order",
+            "serialized_inverse",
+            "serialized_depth",
+        }.issubset(
+            point.keys()
+        ), "Run point.serialization() point cloud before SerializedPooling"
+        code = point.serialized_code >> pooling_depth * 3
+        code_, cluster, counts = torch.unique(
+            code[0],
+            sorted=True,
+            return_inverse=True,
+            return_counts=True,
+        )
+        # indices of point sorted by cluster, for torch_scatter.segment_csr
+        _, indices = torch.sort(cluster)
+        # index pointer for sorted point, for torch_scatter.segment_csr
+        idx_ptr = torch.cat([counts.new_zeros(1), torch.cumsum(counts, dim=0)])
+        # head_indices of each cluster, for reduce attr e.g. code, batch
+        head_indices = indices[idx_ptr[:-1]]
+        # generate down code, order, inverse
+        code = code[:, head_indices]
+        order = torch.argsort(code)
+        inverse = torch.zeros_like(order).scatter_(
+            dim=1,
+            index=order,
+            src=torch.arange(0, code.shape[1], device=order.device).repeat(
+                code.shape[0], 1
+            ),
+        )
+        if self.shuffle_orders:
+            perm = torch.randperm(code.shape[0])
+            code = code[perm]
+            order = order[perm]
+            inverse = inverse[perm]
+        # collect information
+        point_dict = addict.Dict(
+            feat=torch_scatter.segment_csr(
+                self.proj(point.feat)[indices], idx_ptr, reduce=self.reduce
+            ),
+            coord=torch_scatter.segment_csr(
+                point.coord[indices], idx_ptr, reduce="mean"
+            ),
+            grid_coord=point.grid_coord[head_indices] >> pooling_depth,
+            serialized_code=code,
+            serialized_order=order,
+            serialized_inverse=inverse,
+            serialized_depth=point.serialized_depth - pooling_depth,
+            batch=point.batch[head_indices],
+        )
+        if "condition" in point.keys():
+            point_dict["condition"] = point.condition
+        if "context" in point.keys():
+            point_dict["context"] = point.context
+        if self.traceable:
+            point_dict["pooling_inverse"] = cluster
+            point_dict["pooling_parent"] = point
+        point = Point(point_dict)
+        # Fix: Expected more than 1 value per channel when training
+        if self.norm is not None and point.feat.size(0) != 1:
+            point = self.norm(point)
+        if self.act is not None:
+            point = self.act(point)
+        point.sparsify()
+        return point
+class SerializedUnpooling(PointModule):
+    def __init__(
+        self,
+        in_channels,
+        skip_channels,
+        out_channels,
+        norm_layer=None,
+        act_layer=None,
+        traceable=False,  # record parent and cluster
+    ):
+        super().__init__()
+        self.proj = PointSequential(torch.nn.Linear(in_channels, out_channels))
+        self.proj_skip = PointSequential(torch.nn.Linear(skip_channels, out_channels))
+        if norm_layer is not None:
+            self.proj.add(norm_layer(out_channels))
+            self.proj_skip.add(norm_layer(out_channels))
+        if act_layer is not None:
+            self.proj.add(act_layer())
+            self.proj_skip.add(act_layer())
+        self.traceable = traceable
+    def forward(self, point):
+        assert "pooling_parent" in point.keys()
+        assert "pooling_inverse" in point.keys()
+        parent = point.pop("pooling_parent")
+        inverse = point.pop("pooling_inverse")
+        point = self.proj(point)
+        parent = self.proj_skip(parent)
+        parent.feat = parent.feat + point.feat[inverse]
+        if self.traceable:
+            parent["unpooling_parent"] = point
+        return parent
+class Embedding(PointModule):
+    def __init__(
+        self,
+        in_channels,
+        embed_channels,
+        norm_layer=None,
+        act_layer=None,
+    ):
+        super().__init__()
+        self.in_channels = in_channels
+        self.embed_channels = embed_channels
+        # TODO: check remove spconv
+        self.stem = PointSequential(
+            conv=spconv.SubMConv3d(
+                in_channels,
+                embed_channels,
+                kernel_size=5,
+                padding=1,
+                bias=False,
+                indice_key="stem",
+            )
+        )
+        if norm_layer is not None:
+            self.stem.add(norm_layer(embed_channels), name="norm")
+        if act_layer is not None:
+            self.stem.add(act_layer(), name="act")
+    def forward(self, point: Point):
+        point = self.stem(point)
+        return point
+class PointTransformerV3(PointModule):
+    """Point backbone: an embedding stem, a multi-stage encoder and (unless ``cls_mode``) a decoder.
+    Every encoder stage after the first starts with a SerializedPooling layer; every
+    decoder stage starts with a SerializedUnpooling layer. Both are followed by a
+    stack of Blocks whose drop-path rates increase linearly from 0 to ``drop_path``.
+    The per-stage tuples must satisfy the length assertions in ``__init__``.
+    """
+    def __init__(
+        self,
+        in_channels=6,
+        # NOTE: ("cord") is a plain string, not a 1-tuple; it is wrapped in a list below.
+        order=("cord"),
+        stride=(2, 2, 2, 2),
+        enc_depths=(2, 2, 2, 6, 2),
+        enc_channels=(32, 64, 128, 256, 512),
+        enc_num_head=(2, 4, 8, 16, 32),
+        enc_patch_size=(1024, 1024, 1024, 1024, 1024),
+        dec_depths=(2, 2, 2, 2),
+        dec_channels=(64, 64, 128, 256),
+        dec_num_head=(4, 4, 8, 16),
+        dec_patch_size=(1024, 1024, 1024, 1024),
+        mlp_ratio=4,
+        grid_size=0.01,
+        qkv_bias=True,
+        qk_scale=None,
+        attn_drop=0.0,
+        proj_drop=0.0,
+        drop_path=0.3,
+        pre_norm=True,
+        shuffle_orders=True,
+        enable_rpe=False,
+        enable_flash=True,
+        upcast_attention=False,
+        upcast_softmax=False,
+        cls_mode=False,
+        pdnorm_bn=False,
+        pdnorm_ln=False,
+        pdnorm_decouple=True,
+        pdnorm_adaptive=False,
+        pdnorm_affine=True,
+        pdnorm_conditions=("ScanNet", "S3DIS", "Structured3D"),
+    ):
+        super().__init__()
+        self.num_stages = len(enc_depths)
+        # a single order name given as a string becomes a one-element list
+        self.order = [order] if isinstance(order, str) else order
+        self.cls_mode = cls_mode
+        self.shuffle_orders = shuffle_orders
+        self.grid_size = grid_size
+        # one pooling stride between each pair of consecutive encoder stages
+        assert self.num_stages == len(stride) + 1
+        assert self.num_stages == len(enc_depths)
+        assert self.num_stages == len(enc_channels)
+        assert self.num_stages == len(enc_num_head)
+        assert self.num_stages == len(enc_patch_size)
+        # the decoder has one stage fewer than the encoder; not checked in cls_mode
+        assert self.cls_mode or self.num_stages == len(dec_depths) + 1
+        assert self.cls_mode or self.num_stages == len(dec_channels) + 1
+        assert self.cls_mode or self.num_stages == len(dec_num_head) + 1
+        assert self.cls_mode or self.num_stages == len(dec_patch_size) + 1
+        # norm layers
+        # bn_layer is used by the stem and the pooling/unpooling layers,
+        # ln_layer by the Blocks; each is optionally wrapped in PDNorm.
+        if pdnorm_bn:
+            bn_layer = functools.partial(
+                PDNorm,
+                norm_layer=functools.partial(
+                    torch.nn.BatchNorm1d, eps=1e-3, momentum=0.01, affine=pdnorm_affine
+                ),
+                conditions=pdnorm_conditions,
+                decouple=pdnorm_decouple,
+                adaptive=pdnorm_adaptive,
+            )
+        else:
+            bn_layer = functools.partial(torch.nn.BatchNorm1d, eps=1e-3, momentum=0.01)
+        if pdnorm_ln:
+            ln_layer = functools.partial(
+                PDNorm,
+                norm_layer=functools.partial(
+                    torch.nn.LayerNorm, elementwise_affine=pdnorm_affine
+                ),
+                conditions=pdnorm_conditions,
+                decouple=pdnorm_decouple,
+                adaptive=pdnorm_adaptive,
+            )
+        else:
+            ln_layer = torch.nn.LayerNorm
+        # activation layers
+        act_layer = torch.nn.GELU
+        self.embedding = Embedding(
+            in_channels=in_channels,
+            embed_channels=enc_channels[0],
+            norm_layer=bn_layer,
+            act_layer=act_layer,
+        )
+        # encoder
+        # one drop-path rate per encoder block, linearly spaced from 0 to drop_path
+        enc_drop_path = [
+            x.item() for x in torch.linspace(0, drop_path, sum(enc_depths))
+        ]
+        self.enc = PointSequential(name="encoder")
+        for s in range(self.num_stages):
+            # rates belonging to the blocks of stage s
+            enc_drop_path_ = enc_drop_path[
+                sum(enc_depths[:s]) : sum(enc_depths[: s + 1])
+            ]
+            enc = PointSequential(name="encoder_layer_%d" % s)
+            # every stage except the first begins by pooling the previous stage's output
+            if s > 0:
+                enc.add(
+                    SerializedPooling(
+                        in_channels=enc_channels[s - 1],
+                        out_channels=enc_channels[s],
+                        stride=stride[s - 1],
+                        norm_layer=bn_layer,
+                        act_layer=act_layer,
+                    ),
+                    name="down",
+                )
+            for i in range(enc_depths[s]):
+                enc.add(
+                    Block(
+                        channels=enc_channels[s],
+                        num_heads=enc_num_head[s],
+                        patch_size=enc_patch_size[s],
+                        mlp_ratio=mlp_ratio,
+                        qkv_bias=qkv_bias,
+                        qk_scale=qk_scale,
+                        attn_drop=attn_drop,
+                        proj_drop=proj_drop,
+                        drop_path=enc_drop_path_[i],
+                        norm_layer=ln_layer,
+                        act_layer=act_layer,
+                        pre_norm=pre_norm,
+                        # successive blocks cycle through the configured orders
+                        order_index=i % len(self.order),
+                        cpe_indice_key=f"stage{s}",
+                        enable_rpe=enable_rpe,
+                        enable_flash=enable_flash,
+                        upcast_attention=upcast_attention,
+                        upcast_softmax=upcast_softmax,
+                    ),
+                    name=f"block{i}",
+                )
+            # skip stages that ended up with no modules
+            if len(enc) != 0:
+                self.enc.add(module=enc, name=f"enc{s}")
+        # decoder
+        if not self.cls_mode:
+            # one drop-path rate per decoder block, linearly spaced from 0 to drop_path
+            dec_drop_path = [
+                x.item() for x in torch.linspace(0, drop_path, sum(dec_depths))
+            ]
+            self.dec = PointSequential(name="decoder")
+            # append the deepest encoder width so dec_channels[s + 1] exists for the
+            # deepest decoder stage
+            dec_channels = list(dec_channels) + [enc_channels[-1]]
+            # stages are added deepest first
+            for s in reversed(range(self.num_stages - 1)):
+                dec_drop_path_ = dec_drop_path[
+                    sum(dec_depths[:s]) : sum(dec_depths[: s + 1])
+                ]
+                # the slice is a fresh list, so reversing it leaves dec_drop_path intact
+                dec_drop_path_.reverse()
+                dec = PointSequential(name="decoder_layer_%d" % s)
+                dec.add(
+                    SerializedUnpooling(
+                        in_channels=dec_channels[s + 1],
+                        skip_channels=enc_channels[s],
+                        out_channels=dec_channels[s],
+                        norm_layer=bn_layer,
+                        act_layer=act_layer,
+                    ),
+                    name="up",
+                )
+                for i in range(dec_depths[s]):
+                    dec.add(
+                        Block(
+                            channels=dec_channels[s],
+                            num_heads=dec_num_head[s],
+                            patch_size=dec_patch_size[s],
+                            mlp_ratio=mlp_ratio,
+                            qkv_bias=qkv_bias,
+                            qk_scale=qk_scale,
+                            attn_drop=attn_drop,
+                            proj_drop=proj_drop,
+                            drop_path=dec_drop_path_[i],
+                            norm_layer=ln_layer,
+                            act_layer=act_layer,
+                            pre_norm=pre_norm,
+                            order_index=i % len(self.order),
+                            # same key string as encoder stage s
+                            cpe_indice_key=f"stage{s}",
+                            enable_rpe=enable_rpe,
+                            enable_flash=enable_flash,
+                            upcast_attention=upcast_attention,
+                            upcast_softmax=upcast_softmax,
+                        ),
+                        name=f"block{i}",
+                    )
+                self.dec.add(module=dec, name=f"dec{s}")
+    def forward(self, batch, feat, coord):
+        """
+        Run the backbone on one batched point cloud and return its point features.
+        Each input has ``squeeze(dim=0)`` applied before it is stored in the Point:
+        1. "batch": per-point batch assignment, stored under the "batch" key
+           (see https://github.com/Pointcept/Pointcept?tab=readme-ov-file#offset)
+        2. "feat": feature of point cloud
+        3. "coord": point coordinates; stored together with "grid_size" = self.grid_size
+        NOTE(review): presumably every input arrives with a leading dimension of
+        size 1 -- confirm against the caller.
+        Returns ``point.feat`` of the final stage (the decoder output, or the encoder
+        output in ``cls_mode``) with a new leading dimension added by ``unsqueeze(dim=0)``.
+        """
+        point = Point(
+            {
+                "batch": batch.squeeze(dim=0),
+                "feat": feat.squeeze(dim=0),
+                "coord": coord.squeeze(dim=0),
+                "grid_size": self.grid_size,
+            }
+        )
+        # serialize and sparsify before the stem and stages run
+        point.serialization(order=self.order, shuffle_orders=self.shuffle_orders)
+        point.sparsify()
+        point = self.embedding(point)
+        point = self.enc(point)
+        if not self.cls_mode:
+            point = self.dec(point)
+        return point.feat.unsqueeze(dim=0)

requirements.txt ADDED Viewed

	@@ -0,0 +1,14 @@

+--extra-index-url https://download.pytorch.org/whl/cu121
+easydict
+gradio
+numpy<2.0.0
+opencv-python
+pillow
+scipy
+torch==2.2.2
+torchvision==0.17.2
+addict
+spconv-cu121
+torch_scatter