[ROCm] add support for ROCm/HIP (#509)

* [ROCm] add support for ROCm/HIP

- do not use THC.h or THH.h or THCState
- rename files to avoid collisions at build time due to hipify
- HIP host device definitions in tensorview.h
- use HIP atomicAdd() in scatter_points_cuda.cu
- add WITH_ROCM anywhere WITH_CUDA was used
- update setup.py for renamed files and ROCm/HIP torch

* enforce c++17 since pytorch requires it
This commit is contained in:
Jeff Daily 2024-07-30 21:04:51 -07:00 committed by GitHub
parent 5b08cc8cee
commit 326653dc06
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
15 changed files with 43 additions and 36 deletions

4
.gitignore vendored
View File

@ -132,3 +132,7 @@ dmypy.json
models/* models/*
data/* data/*
runs/* runs/*
# torch hipify generated files
*_hip.*
*.hip

View File

@ -1,16 +1,14 @@
// Modified from // Modified from
// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/ball_query.cpp // https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/ball_query.cpp
#include <THC/THC.h>
#include <cuda.h> #include <cuda.h>
#include <cuda_runtime_api.h> #include <cuda_runtime_api.h>
#include <torch/extension.h> #include <torch/extension.h>
#include <torch/serialize/tensor.h> #include <torch/serialize/tensor.h>
#include <c10/cuda/CUDAStream.h>
#include <vector> #include <vector>
extern THCState *state;
#define CHECK_CUDA(x) \ #define CHECK_CUDA(x) \
TORCH_CHECK(x.type().is_cuda(), #x, " must be a CUDAtensor ") TORCH_CHECK(x.type().is_cuda(), #x, " must be a CUDAtensor ")
#define CHECK_CONTIGUOUS(x) \ #define CHECK_CONTIGUOUS(x) \

View File

@ -2,14 +2,11 @@
// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/sampling.cpp // https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/sampling.cpp
#include <ATen/cuda/CUDAContext.h> #include <ATen/cuda/CUDAContext.h>
#include <THC/THC.h>
#include <torch/extension.h> #include <torch/extension.h>
#include <torch/serialize/tensor.h> #include <torch/serialize/tensor.h>
#include <c10/cuda/CUDAStream.h>
#include <vector> #include <vector>
extern THCState *state;
int furthest_point_sampling_wrapper(int b, int n, int m, int furthest_point_sampling_wrapper(int b, int n, int m,
at::Tensor points_tensor, at::Tensor points_tensor,
at::Tensor temp_tensor, at::Tensor temp_tensor,

View File

@ -1,12 +1,10 @@
#include <ATen/cuda/CUDAContext.h> #include <ATen/cuda/CUDAContext.h>
#include <THC/THC.h>
#include <torch/extension.h> #include <torch/extension.h>
#include <torch/serialize/tensor.h> #include <torch/serialize/tensor.h>
#include <c10/cuda/CUDAStream.h>
#include <vector> #include <vector>
extern THCState *state;
int gather_points_wrapper(int b, int c, int n, int npoints, int gather_points_wrapper(int b, int c, int n, int npoints,
at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor points_tensor, at::Tensor idx_tensor,
at::Tensor out_tensor); at::Tensor out_tensor);

View File

@ -1,16 +1,14 @@
// Modified from // Modified from
// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/group_points.cpp // https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/group_points.cpp
#include <THC/THC.h>
#include <cuda.h> #include <cuda.h>
#include <cuda_runtime_api.h> #include <cuda_runtime_api.h>
#include <torch/extension.h> #include <torch/extension.h>
#include <torch/serialize/tensor.h> #include <torch/serialize/tensor.h>
#include <c10/cuda/CUDAStream.h>
#include <vector> #include <vector>
extern THCState *state;
int group_points_wrapper(int b, int c, int n, int npoints, int nsample, int group_points_wrapper(int b, int c, int n, int npoints, int nsample,
at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor points_tensor, at::Tensor idx_tensor,
at::Tensor out_tensor); at::Tensor out_tensor);

View File

@ -1,7 +1,6 @@
// Modified from // Modified from
// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/interpolate.cpp // https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/interpolate.cpp
#include <THC/THC.h>
#include <cuda.h> #include <cuda.h>
#include <cuda_runtime_api.h> #include <cuda_runtime_api.h>
#include <math.h> #include <math.h>
@ -9,11 +8,10 @@
#include <stdlib.h> #include <stdlib.h>
#include <torch/extension.h> #include <torch/extension.h>
#include <torch/serialize/tensor.h> #include <torch/serialize/tensor.h>
#include <c10/cuda/CUDAStream.h>
#include <vector> #include <vector>
extern THCState *state;
void three_nn_wrapper(int b, int n, int m, at::Tensor unknown_tensor, void three_nn_wrapper(int b, int n, int m, at::Tensor unknown_tensor,
at::Tensor known_tensor, at::Tensor dist2_tensor, at::Tensor known_tensor, at::Tensor dist2_tensor,
at::Tensor idx_tensor); at::Tensor idx_tensor);

View File

@ -3,10 +3,8 @@
#include <torch/serialize/tensor.h> #include <torch/serialize/tensor.h>
#include <torch/extension.h> #include <torch/extension.h>
#include <vector> #include <vector>
#include <THC/THC.h>
#include <ATen/cuda/CUDAContext.h> #include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAStream.h>
extern THCState *state;
#define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x, " must be a CUDAtensor ") #define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x, " must be a CUDAtensor ")
#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x, " must be contiguous ") #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x, " must be contiguous ")

View File

@ -27,7 +27,7 @@
namespace tv { namespace tv {
#ifdef __NVCC__ #if defined(__NVCC__) || defined(__HIP__)
#define TV_HOST_DEVICE_INLINE __forceinline__ __device__ __host__ #define TV_HOST_DEVICE_INLINE __forceinline__ __device__ __host__
#define TV_DEVICE_INLINE __forceinline__ __device__ #define TV_DEVICE_INLINE __forceinline__ __device__
#define TV_HOST_DEVICE __device__ __host__ #define TV_HOST_DEVICE __device__ __host__

View File

@ -75,6 +75,13 @@ __device__ __forceinline__ static void reduceAdd(double *address, double val) {
atomicAdd(address, val); atomicAdd(address, val);
#endif #endif
} }
#elif defined(__HIP__)
__device__ __forceinline__ static void reduceAdd(float *address, float val) {
atomicAdd(address, val);
}
__device__ __forceinline__ static void reduceAdd(double *address, double val) {
atomicAdd(address, val);
}
#endif #endif
template <typename T> template <typename T>

View File

@ -21,7 +21,7 @@ std::vector<at::Tensor> dynamic_point_to_voxel_cpu(
const at::Tensor &points, const at::Tensor &voxel_mapping, const at::Tensor &points, const at::Tensor &voxel_mapping,
const std::vector<float> voxel_size, const std::vector<float> coors_range); const std::vector<float> voxel_size, const std::vector<float> coors_range);
#ifdef WITH_CUDA #if defined(WITH_CUDA) || defined(WITH_ROCM)
int hard_voxelize_gpu(const at::Tensor &points, at::Tensor &voxels, int hard_voxelize_gpu(const at::Tensor &points, at::Tensor &voxels,
at::Tensor &coors, at::Tensor &num_points_per_voxel, at::Tensor &coors, at::Tensor &num_points_per_voxel,
const std::vector<float> voxel_size, const std::vector<float> voxel_size,
@ -62,7 +62,7 @@ inline int hard_voxelize(const at::Tensor &points, at::Tensor &voxels,
const int max_points, const int max_voxels, const int max_points, const int max_voxels,
const int NDim = 3, const bool deterministic = true) { const int NDim = 3, const bool deterministic = true) {
if (points.device().is_cuda()) { if (points.device().is_cuda()) {
#ifdef WITH_CUDA #if defined(WITH_CUDA) || defined(WITH_ROCM)
if (deterministic) { if (deterministic) {
return hard_voxelize_gpu(points, voxels, coors, num_points_per_voxel, return hard_voxelize_gpu(points, voxels, coors, num_points_per_voxel,
voxel_size, coors_range, max_points, max_voxels, voxel_size, coors_range, max_points, max_voxels,
@ -85,7 +85,7 @@ inline void dynamic_voxelize(const at::Tensor &points, at::Tensor &coors,
const std::vector<float> coors_range, const std::vector<float> coors_range,
const int NDim = 3) { const int NDim = 3) {
if (points.device().is_cuda()) { if (points.device().is_cuda()) {
#ifdef WITH_CUDA #if defined(WITH_CUDA) || defined(WITH_ROCM)
return dynamic_voxelize_gpu(points, coors, voxel_size, coors_range, NDim); return dynamic_voxelize_gpu(points, coors, voxel_size, coors_range, NDim);
#else #else
AT_ERROR("Not compiled with GPU support"); AT_ERROR("Not compiled with GPU support");
@ -109,7 +109,7 @@ inline std::vector<torch::Tensor> dynamic_point_to_voxel_forward(const torch::Te
const torch::Tensor &coors, const torch::Tensor &coors,
const std::string &reduce_type) { const std::string &reduce_type) {
if (feats.device().is_cuda()) { if (feats.device().is_cuda()) {
#ifdef WITH_CUDA #if defined(WITH_CUDA) || defined(WITH_ROCM)
return dynamic_point_to_voxel_forward_gpu(feats, coors, convert_reduce_type(reduce_type)); return dynamic_point_to_voxel_forward_gpu(feats, coors, convert_reduce_type(reduce_type));
#else #else
TORCH_CHECK(false, "Not compiled with GPU support"); TORCH_CHECK(false, "Not compiled with GPU support");
@ -127,7 +127,7 @@ inline void dynamic_point_to_voxel_backward(torch::Tensor &grad_feats,
const torch::Tensor &reduce_count, const torch::Tensor &reduce_count,
const std::string &reduce_type) { const std::string &reduce_type) {
if (grad_feats.device().is_cuda()) { if (grad_feats.device().is_cuda()) {
#ifdef WITH_CUDA #if defined(WITH_CUDA) || defined(WITH_ROCM)
dynamic_point_to_voxel_backward_gpu( dynamic_point_to_voxel_backward_gpu(
grad_feats, grad_reduced_feats, feats, reduced_feats, coors_idx, reduce_count, grad_feats, grad_reduced_feats, feats, reduced_feats, coors_idx, reduce_count,
convert_reduce_type(reduce_type)); convert_reduce_type(reduce_type));

View File

@ -12,7 +12,7 @@ def make_cuda_ext(
define_macros = [] define_macros = []
extra_compile_args = {"cxx": [] + extra_args} extra_compile_args = {"cxx": [] + extra_args}
if torch.cuda.is_available() or os.getenv("FORCE_CUDA", "0") == "1": if (torch.cuda.is_available() and torch.version.cuda is not None) or os.getenv("FORCE_CUDA", "0") == "1":
define_macros += [("WITH_CUDA", None)] define_macros += [("WITH_CUDA", None)]
extension = CUDAExtension extension = CUDAExtension
extra_compile_args["nvcc"] = extra_args + [ extra_compile_args["nvcc"] = extra_args + [
@ -25,6 +25,15 @@ def make_cuda_ext(
"-gencode=arch=compute_86,code=sm_86", "-gencode=arch=compute_86,code=sm_86",
] ]
sources += sources_cuda sources += sources_cuda
elif (torch.cuda.is_available() and torch.version.hip is not None) or os.getenv("FORCE_ROCM", "0") == 1:
define_macros += [("WITH_ROCM", None)]
extension = CUDAExtension
extra_compile_args["hipcc"] = extra_args + [
"-D__HIP_NO_HALF_OPERATORS__",
"-D__HIP_NO_HALF_CONVERSIONS__",
"-D__HIP_NO_HALF2_OPERATORS__",
]
sources += sources_cuda
else: else:
print("Compiling {} without CUDA".format(name)) print("Compiling {} without CUDA".format(name))
extension = CppExtension extension = CppExtension
@ -66,20 +75,20 @@ if __name__ == "__main__":
], ],
sources=[ sources=[
"src/all.cc", "src/all.cc",
"src/reordering.cc", "src/reordering_cpu.cc",
"src/reordering_cuda.cu", "src/reordering_cuda.cu",
"src/indice.cc", "src/indice_cpu.cc",
"src/indice_cuda.cu", "src/indice_cuda.cu",
"src/maxpool.cc", "src/maxpool_cpu.cc",
"src/maxpool_cuda.cu", "src/maxpool_cuda.cu",
], ],
extra_args=["-w", "-std=c++14"], extra_args=["-w", "-std=c++17"],
), ),
make_cuda_ext( make_cuda_ext(
name="bev_pool_ext", name="bev_pool_ext",
module="mmdet3d.ops.bev_pool", module="mmdet3d.ops.bev_pool",
sources=[ sources=[
"src/bev_pool.cpp", "src/bev_pool_cpu.cpp",
"src/bev_pool_cuda.cu", "src/bev_pool_cuda.cu",
], ],
), ),
@ -117,13 +126,13 @@ if __name__ == "__main__":
make_cuda_ext( make_cuda_ext(
name="ball_query_ext", name="ball_query_ext",
module="mmdet3d.ops.ball_query", module="mmdet3d.ops.ball_query",
sources=["src/ball_query.cpp"], sources=["src/ball_query_cpu.cpp"],
sources_cuda=["src/ball_query_cuda.cu"], sources_cuda=["src/ball_query_cuda.cu"],
), ),
make_cuda_ext( make_cuda_ext(
name="knn_ext", name="knn_ext",
module="mmdet3d.ops.knn", module="mmdet3d.ops.knn",
sources=["src/knn.cpp"], sources=["src/knn_cpu.cpp"],
sources_cuda=["src/knn_cuda.cu"], sources_cuda=["src/knn_cuda.cu"],
), ),
make_cuda_ext( make_cuda_ext(
@ -135,7 +144,7 @@ if __name__ == "__main__":
make_cuda_ext( make_cuda_ext(
name="group_points_ext", name="group_points_ext",
module="mmdet3d.ops.group_points", module="mmdet3d.ops.group_points",
sources=["src/group_points.cpp"], sources=["src/group_points_cpu.cpp"],
sources_cuda=["src/group_points_cuda.cu"], sources_cuda=["src/group_points_cuda.cu"],
), ),
make_cuda_ext( make_cuda_ext(
@ -147,13 +156,13 @@ if __name__ == "__main__":
make_cuda_ext( make_cuda_ext(
name="furthest_point_sample_ext", name="furthest_point_sample_ext",
module="mmdet3d.ops.furthest_point_sample", module="mmdet3d.ops.furthest_point_sample",
sources=["src/furthest_point_sample.cpp"], sources=["src/furthest_point_sample_cpu.cpp"],
sources_cuda=["src/furthest_point_sample_cuda.cu"], sources_cuda=["src/furthest_point_sample_cuda.cu"],
), ),
make_cuda_ext( make_cuda_ext(
name="gather_points_ext", name="gather_points_ext",
module="mmdet3d.ops.gather_points", module="mmdet3d.ops.gather_points",
sources=["src/gather_points.cpp"], sources=["src/gather_points_cpu.cpp"],
sources_cuda=["src/gather_points_cuda.cu"], sources_cuda=["src/gather_points_cuda.cu"],
), ),
], ],