diff --git a/CMakeLists.txt b/CMakeLists.txt index 81ca559d530..e6b97786888 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -32,9 +32,11 @@ file(GLOB HEADERS torchvision/csrc/*.h) # Image extension file(GLOB IMAGE_HEADERS torchvision/csrc/cpu/image/*.h) file(GLOB IMAGE_SOURCES torchvision/csrc/cpu/image/*.cpp) -file(GLOB OPERATOR_SOURCES torchvision/csrc/cpu/*.h torchvision/csrc/cpu/*.cpp ${IMAGE_HEADERS} ${IMAGE_SOURCES} ${HEADERS} torchvision/csrc/*.cpp) +file(GLOB OPERATOR_HEADERS torchvision/csrc/cpu/*.h) +file(GLOB OPERATOR_SOURCES ${OPERATOR_HEADERS} torchvision/csrc/cpu/*.cpp ${IMAGE_HEADERS} ${IMAGE_SOURCES} ${HEADERS} torchvision/csrc/*.cpp) if(WITH_CUDA) - file(GLOB OPERATOR_SOURCES ${OPERATOR_SOURCES} torchvision/csrc/cuda/*.h torchvision/csrc/cuda/*.cu) + file(GLOB OPERATOR_HEADERS ${OPERATOR_HEADERS} torchvision/csrc/cuda/*.h) + file(GLOB OPERATOR_SOURCES ${OPERATOR_SOURCES} ${OPERATOR_HEADERS} torchvision/csrc/cuda/*.cu) endif() file(GLOB MODELS_HEADERS torchvision/csrc/models/*.h) file(GLOB MODELS_SOURCES torchvision/csrc/models/*.h torchvision/csrc/models/*.cpp) @@ -95,11 +97,11 @@ install(EXPORT TorchVisionTargets install(FILES ${HEADERS} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/${PROJECT_NAME}) install(FILES - torchvision/csrc/cpu/vision_cpu.h + ${OPERATOR_HEADERS} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/${PROJECT_NAME}/cpu) if(WITH_CUDA) install(FILES - torchvision/csrc/cuda/vision_cuda.h + ${OPERATOR_HEADERS} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/${PROJECT_NAME}/cuda) endif() install(FILES ${MODELS_HEADERS} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/${PROJECT_NAME}/models) diff --git a/setup.py b/setup.py index 82c93be87cd..0a363794da5 100644 --- a/setup.py +++ b/setup.py @@ -152,8 +152,8 @@ def get_extensions(): ) source_cuda = glob.glob(os.path.join(extensions_dir, 'hip', '*.hip')) # Copy over additional files - shutil.copy("torchvision/csrc/cuda/cuda_helpers.h", "torchvision/csrc/hip/cuda_helpers.h") - shutil.copy("torchvision/csrc/cuda/vision_cuda.h", "torchvision/csrc/hip/vision_cuda.h") + for file in glob.glob(r"torchvision/csrc/cuda/*.h"): + shutil.copy(file, "torchvision/csrc/hip") else: source_cuda = glob.glob(os.path.join(extensions_dir, 'cuda', '*.cu')) diff --git a/test/tracing/frcnn/test_frcnn_tracing.cpp b/test/tracing/frcnn/test_frcnn_tracing.cpp index a23b95cf88f..7b10aee3c89 100644 --- a/test/tracing/frcnn/test_frcnn_tracing.cpp +++ b/test/tracing/frcnn/test_frcnn_tracing.cpp @@ -1,14 +1,13 @@ #include #include #include -#include -#include +#include #include #ifdef _WIN32 // Windows only // This is necessary until operators are automatically registered on include -static auto _nms = &nms_cpu; +static auto _nms = &vision::ops::nms_cpu; #endif int main() { diff --git a/torchvision/csrc/autocast.h b/torchvision/csrc/autocast.h deleted file mode 100644 index 1f954464b72..00000000000 --- a/torchvision/csrc/autocast.h +++ /dev/null @@ -1,5 +0,0 @@ -#pragma once - -#if defined(WITH_CUDA) || defined(WITH_HIP) -#include -#endif diff --git a/torchvision/csrc/cpu/DeformConv_cpu.cpp b/torchvision/csrc/cpu/deform_conv2d_kernel.cpp similarity index 84% rename from torchvision/csrc/cpu/DeformConv_cpu.cpp rename to torchvision/csrc/cpu/deform_conv2d_kernel.cpp index 0212be55aa4..4ae2d0a02db 100644 --- a/torchvision/csrc/cpu/DeformConv_cpu.cpp +++ b/torchvision/csrc/cpu/deform_conv2d_kernel.cpp @@ -66,18 +66,17 @@ // modified from // https://github.com/open-mmlab/mmdetection/blob/master/mmdet/ops/dcn/src/deform_conv_cuda.cpp -#include -#include -#include +#include 
"deform_conv2d_kernel.h" -#include -#include -#include +namespace vision { +namespace ops { + +namespace { const int kMaxParallelImgs = 32; template -static scalar_t bilinear_interpolate( +scalar_t bilinear_interpolate( const scalar_t* in, int height, int width, @@ -116,7 +115,7 @@ static scalar_t bilinear_interpolate( } template -static void deformable_im2col_kernel( +void deformable_im2col_kernel( int n, const scalar_t* input, const scalar_t* offset, @@ -129,8 +128,8 @@ static void deformable_im2col_kernel( int pad_w, int stride_h, int stride_w, - int dil_h, - int dil_w, + int dilation_h, + int dilation_w, int batch_sz, int n_in_channels, int n_offset_grps, @@ -180,8 +179,10 @@ static void deformable_im2col_kernel( offset_ptr[offset_idx * (out_h * out_w) + out_y * out_w + out_x]; const scalar_t offset_w = offset_ptr [(offset_idx + 1) * (out_h * out_w) + out_y * out_w + out_x]; - const scalar_t y = (out_y * stride_h - pad_h) + i * dil_h + offset_h; - const scalar_t x = (out_x * stride_w - pad_w) + j * dil_w + offset_w; + const scalar_t y = + (out_y * stride_h - pad_h) + i * dilation_h + offset_h; + const scalar_t x = + (out_x * stride_w - pad_w) + j * dilation_w + offset_w; *columns_ptr = mask_value * bilinear_interpolate(input_ptr, height, width, y, x); columns_ptr += batch_sz * out_h * out_w; @@ -190,7 +191,7 @@ static void deformable_im2col_kernel( } } -static void deformable_im2col( +void deformable_im2col( const at::Tensor& input, const at::Tensor& data_offset, const at::Tensor& data_mask, @@ -203,8 +204,8 @@ static void deformable_im2col( int pad_w, int stride_h, int stride_w, - int dil_h, - int dil_w, + int dilation_h, + int dilation_w, int out_h, int out_w, int parallel_imgs, @@ -228,8 +229,8 @@ static void deformable_im2col( pad_w, stride_h, stride_w, - dil_h, - dil_w, + dilation_h, + dilation_w, parallel_imgs, n_in_channels, deformable_group, @@ -240,7 +241,7 @@ static void deformable_im2col( })); } -static int get_greatest_divisor_below_bound(int n, int bound) { +int get_greatest_divisor_below_bound(int n, int bound) { for (int k = bound; k > 1; --k) { if (n % k == 0) { return k; @@ -249,216 +250,8 @@ static int get_greatest_divisor_below_bound(int n, int bound) { return 1; } -at::Tensor DeformConv2d_forward_cpu( - const at::Tensor& input_param, - const at::Tensor& weight_param, - const at::Tensor& offset_param, - const at::Tensor& mask_param, - const at::Tensor& bias_param, - int64_t stride_h, - int64_t stride_w, - int64_t pad_h, - int64_t pad_w, - int64_t dil_h, - int64_t dil_w, - int64_t n_weight_grps, - int64_t n_offset_grps, - bool use_mask) { - at::Tensor input = input_param.contiguous(); - at::Tensor offset = offset_param.contiguous(); - at::Tensor weight = weight_param.contiguous(); - at::Tensor mask = mask_param.contiguous(); - at::Tensor bias = bias_param.contiguous(); - - TORCH_CHECK(input.ndimension() == 4); - TORCH_CHECK(offset.ndimension() == 4); - TORCH_CHECK(!use_mask || mask.ndimension() == 4); - TORCH_CHECK(weight.ndimension() == 4); - TORCH_CHECK(input.device().is_cpu(), "input must be a CPU tensor"); - - int batch_sz = input.size(0); - int n_in_channels = input.size(1); - int in_h = input.size(2); - int in_w = input.size(3); - - int n_parallel_imgs = - get_greatest_divisor_below_bound(batch_sz, kMaxParallelImgs); - - // Unpack shapes and args - int out_channels = weight.size(0); - int weight_h = weight.size(2); - int weight_w = weight.size(3); - - int ker_h = dil_h * (weight_h - 1) + 1; - int ker_w = dil_w * (weight_w - 1) + 1; - int out_h = ((in_h + 2 * 
pad_h - ker_h) / stride_h) + 1; - int out_w = ((in_w + 2 * pad_w - ker_w) / stride_w) + 1; - - TORCH_CHECK( - weight_h > 0 && weight_w > 0, - "weight_h: ", - weight_h, - " weight_w: ", - weight_w); - TORCH_CHECK( - stride_h > 0 && stride_w > 0, - "stride_h: ", - stride_h, - " stride_w: ", - stride_w); - TORCH_CHECK(pad_h >= 0 && pad_w >= 0, "pad_h: ", pad_h, " pad_w: ", pad_w); - TORCH_CHECK(dil_h > 0 && dil_w > 0, "dil_h: ", dil_h, " dil_w: ", dil_w); - - TORCH_CHECK(weight.size(1) * n_weight_grps == input.size(1)); - TORCH_CHECK(weight.size(0) % n_weight_grps == 0); - TORCH_CHECK( - (offset.size(1) == n_offset_grps * 2 * weight_h * weight_w), - "offset.shape[1] is not valid: got: ", - offset.size(1), - " expected: ", - n_offset_grps * 2 * weight_h * weight_w); - TORCH_CHECK( - (!use_mask || mask.size(1) == n_offset_grps * weight_h * weight_w), - "mask.shape[1] is not valid: got: ", - mask.size(1), - " expected: ", - n_offset_grps * weight_h * weight_w); - TORCH_CHECK(input.size(1) % n_offset_grps == 0); - - TORCH_CHECK( - (offset.size(0) == input.size(0)), "invalid batch size of offset"); - TORCH_CHECK( - (offset.size(2) == out_h && offset.size(3) == out_w), - "offset output dims: (", - offset.size(2), - ", ", - offset.size(3), - ") - ", - "computed output dims: (", - out_h, - ", ", - out_w, - ")"); - TORCH_CHECK((mask.size(0) == input.size(0)), "invalid batch size of mask"); - TORCH_CHECK( - (!use_mask || (mask.size(2) == out_h && mask.size(3) == out_w)), - "offset output dims: (", - mask.size(2), - ", ", - mask.size(3), - ") - ", - "computed output dims: (", - out_h, - ", ", - out_w, - ")"); - TORCH_CHECK( - out_h > 0 && out_w > 0, - "Calculated output size too small - out_h: ", - out_h, - " out_w: ", - out_w); - - auto out = at::zeros({batch_sz, out_channels, out_h, out_w}, input.options()); - if (batch_sz == 0) { - return out; - } - - // Separate batches into blocks - out = out.view({batch_sz / n_parallel_imgs, - n_parallel_imgs, - out_channels, - out_h, - out_w}); - input = input.view( - {batch_sz / n_parallel_imgs, n_parallel_imgs, n_in_channels, in_h, in_w}); - - offset = offset.view({batch_sz / n_parallel_imgs, - n_parallel_imgs, - n_offset_grps * 2 * weight_h * weight_w, - out_h, - out_w}); - - if (use_mask) { - mask = mask.view({batch_sz / n_parallel_imgs, - n_parallel_imgs, - n_offset_grps * weight_h * weight_w, - out_h, - out_w}); - } - - at::Tensor out_buf = at::zeros( - {batch_sz / n_parallel_imgs, - out_channels, - n_parallel_imgs * out_h, - out_w}, - out.options()); - - // Separate channels into convolution groups - out_buf = out_buf.view({out_buf.size(0), - n_weight_grps, - out_buf.size(1) / n_weight_grps, - out_buf.size(2), - out_buf.size(3)}); - weight = weight.view({n_weight_grps, - weight.size(0) / n_weight_grps, - weight.size(1), - weight.size(2), - weight.size(3)}); - - // Sample points and perform convolution - auto columns = at::zeros( - {n_in_channels * weight_h * weight_w, n_parallel_imgs * out_h * out_w}, - input.options()); - for (int b = 0; b < batch_sz / n_parallel_imgs; b++) { - deformable_im2col( - input[b], - offset[b], - mask[b], - n_in_channels, - in_h, - in_w, - weight_h, - weight_w, - pad_h, - pad_w, - stride_h, - stride_w, - dil_h, - dil_w, - out_h, - out_w, - n_parallel_imgs, - n_offset_grps, - use_mask, - columns); - - columns = columns.view( - {n_weight_grps, columns.size(0) / n_weight_grps, columns.size(1)}); - for (int g = 0; g < n_weight_grps; g++) { - out_buf[b][g] = out_buf[b][g] - .flatten(1) - .addmm_(weight[g].flatten(1), columns[g]) - 
.view_as(out_buf[b][g]); - } - columns = - columns.view({columns.size(0) * columns.size(1), columns.size(2)}); - } - - out_buf = out_buf.view({batch_sz / n_parallel_imgs, - out_channels, - n_parallel_imgs, - out_h, - out_w}); - out_buf.transpose_(1, 2); - out.copy_(out_buf); - out = out.view({batch_sz, out_channels, out_h, out_w}); - - return out + bias.view({1, out_channels, 1, 1}); -} - template -static void deformable_col2im_kernel( +void deformable_col2im_kernel( int n, const scalar_t* col, const scalar_t* offset, @@ -533,7 +326,7 @@ static void deformable_col2im_kernel( } } -static void compute_grad_input( +void compute_grad_input( const at::Tensor& columns, const at::Tensor& offset, const at::Tensor& mask, @@ -560,7 +353,7 @@ static void compute_grad_input( channels * weight_h * weight_w * out_h * out_w * parallel_imgs; AT_DISPATCH_FLOATING_TYPES_AND_HALF( - columns.scalar_type(), "deformable_col2im", ([&] { + columns.scalar_type(), "compute_grad_input", ([&] { deformable_col2im_kernel( num_kernels, columns.data_ptr(), @@ -587,7 +380,7 @@ static void compute_grad_input( } template -static scalar_t get_coordinate_weight( +scalar_t get_coordinate_weight( const scalar_t* im_data, int height, int width, @@ -620,7 +413,7 @@ static scalar_t get_coordinate_weight( } template -static void deformable_col2im_coord_kernel( +void deformable_col2im_coord_kernel( int n, const scalar_t* col, const scalar_t* im, @@ -732,7 +525,7 @@ static void deformable_col2im_coord_kernel( } } -static void compute_grad_offset_and_mask( +void compute_grad_offset_and_mask( const at::Tensor& columns, const at::Tensor& input, const at::Tensor& offset, @@ -761,7 +554,7 @@ static void compute_grad_offset_and_mask( out_h * out_w * 2 * weight_h * weight_w * n_offset_grps * parallel_imgs; AT_DISPATCH_FLOATING_TYPES_AND_HALF( - columns.scalar_type(), "deformable_col2im_coord", ([&] { + columns.scalar_type(), "compute_grad_offset_and_mask", ([&] { deformable_col2im_coord_kernel( num_kernels, columns.data_ptr(), @@ -790,8 +583,7 @@ static void compute_grad_offset_and_mask( })); } -static std::tuple -deform_conv2d_backward_input_cpu( +std::tuple backward_gradient_inputs( at::Tensor input, at::Tensor weight, at::Tensor offset, @@ -801,8 +593,8 @@ deform_conv2d_backward_input_cpu( int stride_w, int pad_h, int pad_w, - int dil_h, - int dil_w, + int dilation_h, + int dilation_w, int n_weight_grps, int n_offset_grps, int n_parallel_imgs, @@ -818,8 +610,10 @@ deform_conv2d_backward_input_cpu( int weight_h = weight.size(2); int weight_w = weight.size(3); - long out_h = (in_h + 2 * pad_h - (dil_h * (weight_h - 1) + 1)) / stride_h + 1; - long out_w = (in_w + 2 * pad_w - (dil_w * (weight_w - 1) + 1)) / stride_w + 1; + long out_h = + (in_h + 2 * pad_h - (dilation_h * (weight_h - 1) + 1)) / stride_h + 1; + long out_w = + (in_w + 2 * pad_w - (dilation_w * (weight_w - 1) + 1)) / stride_w + 1; auto grad_input = at::zeros_like(input); auto grad_offset = at::zeros_like(offset); @@ -903,8 +697,8 @@ deform_conv2d_backward_input_cpu( pad_w, stride_h, stride_w, - dil_h, - dil_w, + dilation_h, + dilation_w, n_parallel_imgs, n_offset_grps, use_mask, @@ -924,8 +718,8 @@ deform_conv2d_backward_input_cpu( pad_w, stride_h, stride_w, - dil_h, - dil_w, + dilation_h, + dilation_w, n_parallel_imgs, n_offset_grps, use_mask, @@ -944,7 +738,7 @@ deform_conv2d_backward_input_cpu( return std::make_tuple(grad_input, grad_offset, grad_mask); } -static at::Tensor deform_conv2d_backward_parameters_cpu( +at::Tensor backward_gradient_parameters( at::Tensor input, 
const at::Tensor& weight, at::Tensor offset, @@ -954,8 +748,8 @@ static at::Tensor deform_conv2d_backward_parameters_cpu( int stride_w, int pad_h, int pad_w, - int dil_h, - int dil_w, + int dilation_h, + int dilation_w, int n_weight_grps, int n_offset_grps, int n_parallel_imgs, @@ -1032,8 +826,8 @@ static at::Tensor deform_conv2d_backward_parameters_cpu( pad_w, stride_h, stride_w, - dil_h, - dil_w, + dilation_h, + dilation_w, out_h, out_w, n_parallel_imgs, @@ -1058,46 +852,263 @@ static at::Tensor deform_conv2d_backward_parameters_cpu( return grad_weight; } +} // namespace + +at::Tensor deform_conv2d_forward_cpu( + const at::Tensor& input, + const at::Tensor& weight, + const at::Tensor& offset, + const at::Tensor& mask, + const at::Tensor& bias, + int64_t stride_h, + int64_t stride_w, + int64_t pad_h, + int64_t pad_w, + int64_t dilation_h, + int64_t dilation_w, + int64_t n_weight_grps, + int64_t n_offset_grps, + bool use_mask) { + at::Tensor input_c = input.contiguous(); + at::Tensor offset_c = offset.contiguous(); + at::Tensor weight_c = weight.contiguous(); + at::Tensor mask_c = mask.contiguous(); + at::Tensor bias_c = bias.contiguous(); + + TORCH_CHECK(input_c.ndimension() == 4); + TORCH_CHECK(offset_c.ndimension() == 4); + TORCH_CHECK(!use_mask || mask_c.ndimension() == 4); + TORCH_CHECK(weight_c.ndimension() == 4); + TORCH_CHECK(input_c.device().is_cpu(), "input must be a CPU tensor"); + + int batch_sz = input_c.size(0); + int n_in_channels = input_c.size(1); + int in_h = input_c.size(2); + int in_w = input_c.size(3); + + int n_parallel_imgs = + get_greatest_divisor_below_bound(batch_sz, kMaxParallelImgs); + + // Unpack shapes and args + int out_channels = weight_c.size(0); + int weight_h = weight_c.size(2); + int weight_w = weight_c.size(3); + + int ker_h = dilation_h * (weight_h - 1) + 1; + int ker_w = dilation_w * (weight_w - 1) + 1; + int out_h = ((in_h + 2 * pad_h - ker_h) / stride_h) + 1; + int out_w = ((in_w + 2 * pad_w - ker_w) / stride_w) + 1; + + TORCH_CHECK( + weight_h > 0 && weight_w > 0, + "weight_h: ", + weight_h, + " weight_w: ", + weight_w); + TORCH_CHECK( + stride_h > 0 && stride_w > 0, + "stride_h: ", + stride_h, + " stride_w: ", + stride_w); + TORCH_CHECK(pad_h >= 0 && pad_w >= 0, "pad_h: ", pad_h, " pad_w: ", pad_w); + TORCH_CHECK( + dilation_h > 0 && dilation_w > 0, + "dilation_h: ", + dilation_h, + " dilation_w: ", + dilation_w); + + TORCH_CHECK(weight_c.size(1) * n_weight_grps == input_c.size(1)); + TORCH_CHECK(weight_c.size(0) % n_weight_grps == 0); + TORCH_CHECK( + (offset_c.size(1) == n_offset_grps * 2 * weight_h * weight_w), + "offset.shape[1] is not valid: got: ", + offset_c.size(1), + " expected: ", + n_offset_grps * 2 * weight_h * weight_w); + TORCH_CHECK( + (!use_mask || mask_c.size(1) == n_offset_grps * weight_h * weight_w), + "mask.shape[1] is not valid: got: ", + mask_c.size(1), + " expected: ", + n_offset_grps * weight_h * weight_w); + TORCH_CHECK(input_c.size(1) % n_offset_grps == 0); + + TORCH_CHECK( + (offset_c.size(0) == input_c.size(0)), "invalid batch size of offset"); + TORCH_CHECK( + (offset_c.size(2) == out_h && offset_c.size(3) == out_w), + "offset output dims: (", + offset_c.size(2), + ", ", + offset_c.size(3), + ") - ", + "computed output dims: (", + out_h, + ", ", + out_w, + ")"); + TORCH_CHECK( + (mask_c.size(0) == input_c.size(0)), "invalid batch size of mask"); + TORCH_CHECK( + (!use_mask || (mask_c.size(2) == out_h && mask_c.size(3) == out_w)), + "mask output dims: (", + mask_c.size(2), + ", ", + mask_c.size(3), + ") - ", + 
"computed output dims: (", + out_h, + ", ", + out_w, + ")"); + TORCH_CHECK( + out_h > 0 && out_w > 0, + "Calculated output size too small - out_h: ", + out_h, + " out_w: ", + out_w); + + auto out = + at::zeros({batch_sz, out_channels, out_h, out_w}, input_c.options()); + if (batch_sz == 0) { + return out; + } + + // Separate batches into blocks + out = out.view({batch_sz / n_parallel_imgs, + n_parallel_imgs, + out_channels, + out_h, + out_w}); + input_c = input_c.view( + {batch_sz / n_parallel_imgs, n_parallel_imgs, n_in_channels, in_h, in_w}); + + offset_c = offset_c.view({batch_sz / n_parallel_imgs, + n_parallel_imgs, + n_offset_grps * 2 * weight_h * weight_w, + out_h, + out_w}); + + if (use_mask) { + mask_c = mask_c.view({batch_sz / n_parallel_imgs, + n_parallel_imgs, + n_offset_grps * weight_h * weight_w, + out_h, + out_w}); + } + + at::Tensor out_buf = at::zeros( + {batch_sz / n_parallel_imgs, + out_channels, + n_parallel_imgs * out_h, + out_w}, + out.options()); + + // Separate channels into convolution groups + out_buf = out_buf.view({out_buf.size(0), + n_weight_grps, + out_buf.size(1) / n_weight_grps, + out_buf.size(2), + out_buf.size(3)}); + weight_c = weight_c.view({n_weight_grps, + weight_c.size(0) / n_weight_grps, + weight_c.size(1), + weight_c.size(2), + weight_c.size(3)}); + + // Sample points and perform convolution + auto columns = at::zeros( + {n_in_channels * weight_h * weight_w, n_parallel_imgs * out_h * out_w}, + input_c.options()); + for (int b = 0; b < batch_sz / n_parallel_imgs; b++) { + deformable_im2col( + input_c[b], + offset_c[b], + mask_c[b], + n_in_channels, + in_h, + in_w, + weight_h, + weight_w, + pad_h, + pad_w, + stride_h, + stride_w, + dilation_h, + dilation_w, + out_h, + out_w, + n_parallel_imgs, + n_offset_grps, + use_mask, + columns); + + columns = columns.view( + {n_weight_grps, columns.size(0) / n_weight_grps, columns.size(1)}); + for (int g = 0; g < n_weight_grps; g++) { + out_buf[b][g] = out_buf[b][g] + .flatten(1) + .addmm_(weight_c[g].flatten(1), columns[g]) + .view_as(out_buf[b][g]); + } + columns = + columns.view({columns.size(0) * columns.size(1), columns.size(2)}); + } + + out_buf = out_buf.view({batch_sz / n_parallel_imgs, + out_channels, + n_parallel_imgs, + out_h, + out_w}); + out_buf.transpose_(1, 2); + out.copy_(out_buf); + out = out.view({batch_sz, out_channels, out_h, out_w}); + + return out + bias_c.view({1, out_channels, 1, 1}); +} + std::tuple -DeformConv2d_backward_cpu( - const at::Tensor& grad_out_param, - const at::Tensor& input_param, - const at::Tensor& weight_param, - const at::Tensor& offset_param, - const at::Tensor& mask_param, - const at::Tensor& bias_param, +deform_conv2d_backward_cpu( + const at::Tensor& grad_out, + const at::Tensor& input, + const at::Tensor& weight, + const at::Tensor& offset, + const at::Tensor& mask, + const at::Tensor& bias, int64_t stride_h, int64_t stride_w, int64_t pad_h, int64_t pad_w, - int64_t dil_h, - int64_t dil_w, + int64_t dilation_h, + int64_t dilation_w, int64_t n_weight_grps, int64_t n_offset_grps, bool use_mask) { - at::Tensor grad_out = grad_out_param.contiguous(); - at::Tensor input = input_param.contiguous(); - at::Tensor weight = weight_param.contiguous(); - at::Tensor offset = offset_param.contiguous(); - at::Tensor mask = mask_param.contiguous(); - at::Tensor bias = bias_param.contiguous(); - - const int batch_sz = input.size(0); + at::Tensor grad_out_c = grad_out.contiguous(); + at::Tensor input_c = input.contiguous(); + at::Tensor weight_c = weight.contiguous(); + at::Tensor 
offset_c = offset.contiguous(); + at::Tensor mask_c = mask.contiguous(); + at::Tensor bias_c = bias.contiguous(); + + const int batch_sz = input_c.size(0); const int n_parallel_imgs = get_greatest_divisor_below_bound(batch_sz, kMaxParallelImgs); - auto grad_input_and_offset_and_mask = deform_conv2d_backward_input_cpu( - input, - weight, - offset, - mask, - grad_out, + auto grad_input_and_offset_and_mask = backward_gradient_inputs( + input_c, + weight_c, + offset_c, + mask_c, + grad_out_c, stride_h, stride_w, pad_h, pad_w, - dil_h, - dil_w, + dilation_h, + dilation_w, n_weight_grps, n_offset_grps, n_parallel_imgs, @@ -1107,25 +1118,28 @@ DeformConv2d_backward_cpu( auto grad_offset = std::get<1>(grad_input_and_offset_and_mask); auto grad_mask = std::get<2>(grad_input_and_offset_and_mask); - auto grad_weight = deform_conv2d_backward_parameters_cpu( - input, - weight, - offset, - mask, - grad_out, + auto grad_weight = backward_gradient_parameters( + input_c, + weight_c, + offset_c, + mask_c, + grad_out_c, stride_h, stride_w, pad_h, pad_w, - dil_h, - dil_w, + dilation_h, + dilation_w, n_weight_grps, n_offset_grps, n_parallel_imgs, use_mask); - auto grad_bias = at::ones_like(bias) * grad_out.sum({0, 2, 3}); + auto grad_bias = at::ones_like(bias_c) * grad_out_c.sum({0, 2, 3}); return std::make_tuple( grad_input, grad_weight, grad_offset, grad_mask, grad_bias); } + +} // namespace ops +} // namespace vision diff --git a/torchvision/csrc/cpu/deform_conv2d_kernel.h b/torchvision/csrc/cpu/deform_conv2d_kernel.h new file mode 100644 index 00000000000..2a49bad8304 --- /dev/null +++ b/torchvision/csrc/cpu/deform_conv2d_kernel.h @@ -0,0 +1,45 @@ +#pragma once + +#include +#include "../macros.h" + +namespace vision { +namespace ops { + +VISION_API at::Tensor deform_conv2d_forward_cpu( + const at::Tensor& input, + const at::Tensor& weight, + const at::Tensor& offset, + const at::Tensor& mask, + const at::Tensor& bias, + int64_t stride_h, + int64_t stride_w, + int64_t pad_h, + int64_t pad_w, + int64_t dilation_h, + int64_t dilation_w, + int64_t n_weight_grps, + int64_t n_offset_grps, + bool use_mask); + +VISION_API std:: + tuple + deform_conv2d_backward_cpu( + const at::Tensor& grad_out, + const at::Tensor& input, + const at::Tensor& weight, + const at::Tensor& offset, + const at::Tensor& mask, + const at::Tensor& bias, + int64_t stride_h, + int64_t stride_w, + int64_t pad_h, + int64_t pad_w, + int64_t dilation_h, + int64_t dilation_w, + int64_t n_weight_grps, + int64_t n_offset_grps, + bool use_mask); + +} // namespace ops +} // namespace vision diff --git a/torchvision/csrc/cpu/nms_cpu.cpp b/torchvision/csrc/cpu/nms_kernel.cpp similarity index 90% rename from torchvision/csrc/cpu/nms_cpu.cpp rename to torchvision/csrc/cpu/nms_kernel.cpp index 00a4c61db7a..a77a6906870 100644 --- a/torchvision/csrc/cpu/nms_cpu.cpp +++ b/torchvision/csrc/cpu/nms_kernel.cpp @@ -1,7 +1,12 @@ -#include "vision_cpu.h" +#include "nms_kernel.h" + +namespace vision { +namespace ops { + +namespace { template -at::Tensor nms_cpu_kernel( +at::Tensor nms_kernel_impl( const at::Tensor& dets, const at::Tensor& scores, double iou_threshold) { @@ -69,6 +74,8 @@ at::Tensor nms_cpu_kernel( return keep_t.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep); } +} // namespace + at::Tensor nms_cpu( const at::Tensor& dets, const at::Tensor& scores, @@ -94,8 +101,11 @@ at::Tensor nms_cpu( auto result = at::empty({0}, dets.options()); - AT_DISPATCH_FLOATING_TYPES(dets.scalar_type(), "nms", [&] { - result = nms_cpu_kernel(dets, scores, 
iou_threshold); + AT_DISPATCH_FLOATING_TYPES(dets.scalar_type(), "nms_cpu", [&] { + result = nms_kernel_impl(dets, scores, iou_threshold); }); return result; } + +} // namespace ops +} // namespace vision diff --git a/torchvision/csrc/cpu/nms_kernel.h b/torchvision/csrc/cpu/nms_kernel.h new file mode 100644 index 00000000000..1fdcaf3d3f9 --- /dev/null +++ b/torchvision/csrc/cpu/nms_kernel.h @@ -0,0 +1,15 @@ +#pragma once + +#include +#include "../macros.h" + +namespace vision { +namespace ops { + +VISION_API at::Tensor nms_cpu( + const at::Tensor& dets, + const at::Tensor& scores, + double iou_threshold); + +} // namespace ops +} // namespace vision diff --git a/torchvision/csrc/cpu/PSROIAlign_cpu.cpp b/torchvision/csrc/cpu/ps_roi_align_kernel.cpp similarity index 94% rename from torchvision/csrc/cpu/PSROIAlign_cpu.cpp rename to torchvision/csrc/cpu/ps_roi_align_kernel.cpp index 899dbb208b6..5abe4a41477 100644 --- a/torchvision/csrc/cpu/PSROIAlign_cpu.cpp +++ b/torchvision/csrc/cpu/ps_roi_align_kernel.cpp @@ -1,6 +1,9 @@ -#include -#include -#include +#include "ps_roi_align_kernel.h" + +namespace vision { +namespace ops { + +namespace { template T bilinear_interpolate( @@ -57,7 +60,7 @@ T bilinear_interpolate( } template -void PSROIAlignForwardCPU( +void ps_roi_align_forward_kernel_impl( int nthreads, const T* input, const T spatial_scale, @@ -202,7 +205,7 @@ inline void add(T* address, const T& val) { } template -void PSROIAlignBackwardCPU( +void ps_roi_align_backward_kernel_impl( int nthreads, const T* grad_output, const int* channel_mapping, @@ -298,7 +301,9 @@ void PSROIAlignBackwardCPU( } } -std::tuple PSROIAlign_forward_cpu( +} // namespace + +std::tuple ps_roi_align_forward_cpu( const at::Tensor& input, const at::Tensor& rois, double spatial_scale, @@ -313,7 +318,7 @@ std::tuple PSROIAlign_forward_cpu( at::TensorArg input_t{input, "input", 1}, rois_t{rois, "rois", 2}; - at::CheckedFrom c = "PSROIAlign_forward_cpu"; + at::CheckedFrom c = "ps_roi_align_forward_cpu"; at::checkAllSameType(c, {input_t, rois_t}); int num_rois = rois.size(0); @@ -338,8 +343,8 @@ std::tuple PSROIAlign_forward_cpu( auto input_ = input.contiguous(), rois_ = rois.contiguous(); AT_DISPATCH_FLOATING_TYPES_AND_HALF( - input.scalar_type(), "PSROIAlign_forward", [&] { - PSROIAlignForwardCPU( + input.scalar_type(), "ps_roi_align_forward_cpu", [&] { + ps_roi_align_forward_kernel_impl( output_size, input_.data_ptr(), spatial_scale, @@ -357,7 +362,7 @@ std::tuple PSROIAlign_forward_cpu( return std::make_tuple(output, channel_mapping); } -at::Tensor PSROIAlign_backward_cpu( +at::Tensor ps_roi_align_backward_cpu( const at::Tensor& grad, const at::Tensor& rois, const at::Tensor& channel_mapping, @@ -379,7 +384,7 @@ at::Tensor PSROIAlign_backward_cpu( at::TensorArg grad_t{grad, "grad", 1}, rois_t{rois, "rois", 2}, channel_mapping_t{channel_mapping, "channel_mapping", 3}; - at::CheckedFrom c = "PSROIAlign_backward_cpu"; + at::CheckedFrom c = "ps_roi_align_backward_cpu"; at::checkAllSameType(c, {grad_t, rois_t}); auto num_rois = rois.size(0); @@ -395,8 +400,8 @@ at::Tensor PSROIAlign_backward_cpu( auto grad_ = grad.contiguous(), rois_ = rois.contiguous(); AT_DISPATCH_FLOATING_TYPES_AND_HALF( - grad.scalar_type(), "PSROIAlign_backward", [&] { - PSROIAlignBackwardCPU( + grad.scalar_type(), "ps_roi_align_backward_cpu", [&] { + ps_roi_align_backward_kernel_impl( grad.numel(), grad_.data_ptr(), channel_mapping.data_ptr(), @@ -414,3 +419,6 @@ at::Tensor PSROIAlign_backward_cpu( }); return grad_input; } + +} // namespace ops +} // 
namespace vision diff --git a/torchvision/csrc/cpu/ps_roi_align_kernel.h b/torchvision/csrc/cpu/ps_roi_align_kernel.h new file mode 100644 index 00000000000..a4bea77853b --- /dev/null +++ b/torchvision/csrc/cpu/ps_roi_align_kernel.h @@ -0,0 +1,31 @@ +#pragma once + +#include +#include "../macros.h" + +namespace vision { +namespace ops { + +VISION_API std::tuple ps_roi_align_forward_cpu( + const at::Tensor& input, + const at::Tensor& rois, + double spatial_scale, + int64_t pooled_height, + int64_t pooled_width, + int64_t sampling_ratio); + +VISION_API at::Tensor ps_roi_align_backward_cpu( + const at::Tensor& grad, + const at::Tensor& rois, + const at::Tensor& channel_mapping, + double spatial_scale, + int64_t pooled_height, + int64_t pooled_width, + int64_t sampling_ratio, + int64_t batch_size, + int64_t channels, + int64_t height, + int64_t width); + +} // namespace ops +} // namespace vision diff --git a/torchvision/csrc/cpu/PSROIPool_cpu.cpp b/torchvision/csrc/cpu/ps_roi_pool_kernel.cpp similarity index 92% rename from torchvision/csrc/cpu/PSROIPool_cpu.cpp rename to torchvision/csrc/cpu/ps_roi_pool_kernel.cpp index c6e0a64cac3..425b4c68f1a 100644 --- a/torchvision/csrc/cpu/PSROIPool_cpu.cpp +++ b/torchvision/csrc/cpu/ps_roi_pool_kernel.cpp @@ -1,7 +1,9 @@ -#include -#include -#include -#include +#include "ps_roi_pool_kernel.h" + +namespace vision { +namespace ops { + +namespace { template inline void add(T* address, const T& val) { @@ -9,7 +11,7 @@ inline void add(T* address, const T& val) { } template -void PSROIPoolForward( +void ps_roi_pool_forward_kernel_impl( const T* input, const T spatial_scale, int channels, @@ -79,7 +81,7 @@ void PSROIPoolForward( } template -void PSROIPoolBackward( +void ps_roi_pool_backward_kernel_impl( const T* grad_output, const int* channel_mapping, int num_rois, @@ -143,7 +145,9 @@ void PSROIPoolBackward( } } -std::tuple PSROIPool_forward_cpu( +} // namespace + +std::tuple ps_roi_pool_forward_cpu( const at::Tensor& input, const at::Tensor& rois, double spatial_scale, @@ -157,7 +161,7 @@ std::tuple PSROIPool_forward_cpu( at::TensorArg input_t{input, "input", 1}, rois_t{rois, "rois", 2}; - at::CheckedFrom c = "PSROIPool_forward_cpu"; + at::CheckedFrom c = "ps_roi_pool_forward_cpu"; at::checkAllSameType(c, {input_t, rois_t}); int num_rois = rois.size(0); @@ -182,8 +186,8 @@ std::tuple PSROIPool_forward_cpu( auto input_ = input.contiguous(), rois_ = rois.contiguous(); AT_DISPATCH_FLOATING_TYPES_AND_HALF( - input.scalar_type(), "PSROIPool_forward", [&] { - PSROIPoolForward( + input.scalar_type(), "ps_roi_pool_forward_cpu", [&] { + ps_roi_pool_forward_kernel_impl( input_.data_ptr(), spatial_scale, channels, @@ -200,7 +204,7 @@ std::tuple PSROIPool_forward_cpu( return std::make_tuple(output, channel_mapping); } -at::Tensor PSROIPool_backward_cpu( +at::Tensor ps_roi_pool_backward_cpu( const at::Tensor& grad, const at::Tensor& rois, const at::Tensor& channel_mapping, @@ -221,7 +225,7 @@ at::Tensor PSROIPool_backward_cpu( at::TensorArg grad_t{grad, "grad", 1}, rois_t{rois, "rois", 2}, channel_mapping_t{channel_mapping, "channel_mapping", 3}; - at::CheckedFrom c = "PSROIPool_backward_cpu"; + at::CheckedFrom c = "ps_roi_pool_backward_cpu"; at::checkAllSameType(c, {grad_t, rois_t}); auto num_rois = rois.size(0); @@ -237,8 +241,8 @@ at::Tensor PSROIPool_backward_cpu( auto grad_ = grad.contiguous(), rois_ = rois.contiguous(); AT_DISPATCH_FLOATING_TYPES_AND_HALF( - grad.scalar_type(), "PSROIPool_backward", [&] { - PSROIPoolBackward( + grad.scalar_type(), 
"ps_roi_pool_backward_cpu", [&] { + ps_roi_pool_backward_kernel_impl( grad_.data_ptr(), channel_mapping.data_ptr(), num_rois, @@ -254,3 +258,6 @@ at::Tensor PSROIPool_backward_cpu( }); return grad_input; } + +} // namespace ops +} // namespace vision diff --git a/torchvision/csrc/cpu/ps_roi_pool_kernel.h b/torchvision/csrc/cpu/ps_roi_pool_kernel.h new file mode 100644 index 00000000000..2cefe39e11e --- /dev/null +++ b/torchvision/csrc/cpu/ps_roi_pool_kernel.h @@ -0,0 +1,29 @@ +#pragma once + +#include +#include "../macros.h" + +namespace vision { +namespace ops { + +VISION_API std::tuple ps_roi_pool_forward_cpu( + const at::Tensor& input, + const at::Tensor& rois, + double spatial_scale, + int64_t pooled_height, + int64_t pooled_width); + +VISION_API at::Tensor ps_roi_pool_backward_cpu( + const at::Tensor& grad, + const at::Tensor& rois, + const at::Tensor& channel_mapping, + double spatial_scale, + int64_t pooled_height, + int64_t pooled_width, + int64_t batch_size, + int64_t channels, + int64_t height, + int64_t width); + +} // namespace ops +} // namespace vision diff --git a/torchvision/csrc/cpu/ROIAlign_cpu.cpp b/torchvision/csrc/cpu/roi_align_kernel.cpp similarity index 95% rename from torchvision/csrc/cpu/ROIAlign_cpu.cpp rename to torchvision/csrc/cpu/roi_align_kernel.cpp index 10ebd8158cc..cbb75f2c474 100644 --- a/torchvision/csrc/cpu/ROIAlign_cpu.cpp +++ b/torchvision/csrc/cpu/roi_align_kernel.cpp @@ -1,5 +1,9 @@ -#include -#include "vision_cpu.h" +#include "roi_align_kernel.h" + +namespace vision { +namespace ops { + +namespace { // implementation taken from Caffe2 template @@ -111,7 +115,7 @@ void pre_calc_for_bilinear_interpolate( } template -void ROIAlignForward( +void roi_align_forward_kernel_impl( int nthreads, const T* input, const T& spatial_scale, @@ -277,7 +281,7 @@ inline void add(T* address, const T& val) { } template -void ROIAlignBackward( +void roi_align_backward_kernel_impl( int nthreads, const T* grad_output, const T& spatial_scale, @@ -382,9 +386,11 @@ void ROIAlignBackward( } // ix } // iy } // for -} // ROIAlignBackward +} -at::Tensor ROIAlign_forward_cpu( +} // namespace + +at::Tensor roi_align_forward_cpu( const at::Tensor& input, const at::Tensor& rois, double spatial_scale, @@ -398,7 +404,7 @@ at::Tensor ROIAlign_forward_cpu( at::TensorArg input_t{input, "input", 1}, rois_t{rois, "rois", 2}; - at::CheckedFrom c = "ROIAlign_forward_cpu"; + at::CheckedFrom c = "roi_align_forward_cpu"; at::checkAllSameType(c, {input_t, rois_t}); auto num_rois = rois.size(0); @@ -416,8 +422,8 @@ at::Tensor ROIAlign_forward_cpu( auto input_ = input.contiguous(), rois_ = rois.contiguous(); AT_DISPATCH_FLOATING_TYPES_AND_HALF( - input.scalar_type(), "ROIAlign_forward", [&] { - ROIAlignForward( + input.scalar_type(), "roi_align_forward_cpu", [&] { + roi_align_forward_kernel_impl( output_size, input_.data_ptr(), spatial_scale, @@ -434,7 +440,7 @@ at::Tensor ROIAlign_forward_cpu( return output; } -at::Tensor ROIAlign_backward_cpu( +at::Tensor roi_align_backward_cpu( const at::Tensor& grad, const at::Tensor& rois, double spatial_scale, @@ -451,7 +457,7 @@ at::Tensor ROIAlign_backward_cpu( at::TensorArg grad_t{grad, "grad", 1}, rois_t{rois, "rois", 2}; - at::CheckedFrom c = "ROIAlign_backward_cpu"; + at::CheckedFrom c = "roi_align_backward_cpu"; at::checkAllSameType(c, {grad_t, rois_t}); at::Tensor grad_input = @@ -470,8 +476,8 @@ at::Tensor ROIAlign_backward_cpu( auto rois_ = rois.contiguous(); AT_DISPATCH_FLOATING_TYPES_AND_HALF( - grad.scalar_type(), "ROIAlign_forward", [&] { - 
ROIAlignBackward( + grad.scalar_type(), "roi_align_backward_cpu", [&] { + roi_align_backward_kernel_impl( grad.numel(), grad.data_ptr(), spatial_scale, @@ -491,3 +497,6 @@ at::Tensor ROIAlign_backward_cpu( }); return grad_input; } + +} // namespace ops +} // namespace vision diff --git a/torchvision/csrc/cpu/roi_align_kernel.h b/torchvision/csrc/cpu/roi_align_kernel.h new file mode 100644 index 00000000000..2e7813c261c --- /dev/null +++ b/torchvision/csrc/cpu/roi_align_kernel.h @@ -0,0 +1,32 @@ +#pragma once + +#include +#include "../macros.h" + +namespace vision { +namespace ops { + +VISION_API at::Tensor roi_align_forward_cpu( + const at::Tensor& input, + const at::Tensor& rois, + double spatial_scale, + int64_t pooled_height, + int64_t pooled_width, + int64_t sampling_ratio, + bool aligned); + +VISION_API at::Tensor roi_align_backward_cpu( + const at::Tensor& grad, + const at::Tensor& rois, + double spatial_scale, + int64_t pooled_height, + int64_t pooled_width, + int64_t batch_size, + int64_t channels, + int64_t height, + int64_t width, + int64_t sampling_ratio, + bool aligned); + +} // namespace ops +} // namespace vision diff --git a/torchvision/csrc/cpu/ROIPool_cpu.cpp b/torchvision/csrc/cpu/roi_pool_kernel.cpp similarity index 91% rename from torchvision/csrc/cpu/ROIPool_cpu.cpp rename to torchvision/csrc/cpu/roi_pool_kernel.cpp index 34da4f1d1cc..375b722684e 100644 --- a/torchvision/csrc/cpu/ROIPool_cpu.cpp +++ b/torchvision/csrc/cpu/roi_pool_kernel.cpp @@ -1,7 +1,11 @@ -#include -#include -#include -#include +#include + +#include "roi_pool_kernel.h" + +namespace vision { +namespace ops { + +namespace { template inline void add(T* address, const T& val) { @@ -9,7 +13,7 @@ inline void add(T* address, const T& val) { } template -void RoIPoolForward( +void roi_pool_forward_kernel_impl( const T* input, const T spatial_scale, int channels, @@ -78,7 +82,7 @@ void RoIPoolForward( } template -void RoIPoolBackward( +void roi_pool_backward_kernel_impl( const T* grad_output, const int* argmax_data, int num_rois, @@ -120,7 +124,9 @@ void RoIPoolBackward( } // num_rois } -std::tuple ROIPool_forward_cpu( +} // namespace + +std::tuple roi_pool_forward_cpu( const at::Tensor& input, const at::Tensor& rois, double spatial_scale, @@ -131,7 +137,7 @@ std::tuple ROIPool_forward_cpu( at::TensorArg input_t{input, "input", 1}, rois_t{rois, "rois", 2}; - at::CheckedFrom c = "ROIPool_forward_cpu"; + at::CheckedFrom c = "roi_pool_forward_cpu"; at::checkAllSameType(c, {input_t, rois_t}); int num_rois = rois.size(0); @@ -151,8 +157,8 @@ std::tuple ROIPool_forward_cpu( auto input_ = input.contiguous(), rois_ = rois.contiguous(); AT_DISPATCH_FLOATING_TYPES_AND_HALF( - input.scalar_type(), "ROIPool_forward", [&] { - RoIPoolForward( + input.scalar_type(), "roi_pool_forward_cpu", [&] { + roi_pool_forward_kernel_impl( input_.data_ptr(), spatial_scale, channels, @@ -168,7 +174,7 @@ std::tuple ROIPool_forward_cpu( return std::make_tuple(output, argmax); } -at::Tensor ROIPool_backward_cpu( +at::Tensor roi_pool_backward_cpu( const at::Tensor& grad, const at::Tensor& rois, const at::Tensor& argmax, @@ -188,7 +194,7 @@ at::Tensor ROIPool_backward_cpu( at::TensorArg grad_t{grad, "grad", 1}, rois_t{rois, "rois", 2}; - at::CheckedFrom c = "ROIPool_backward_cpu"; + at::CheckedFrom c = "roi_pool_backward_cpu"; at::checkAllSameType(c, {grad_t, rois_t}); auto num_rois = rois.size(0); @@ -209,8 +215,8 @@ at::Tensor ROIPool_backward_cpu( auto rois_ = rois.contiguous(); AT_DISPATCH_FLOATING_TYPES_AND_HALF( - grad.scalar_type(), 
"ROIPool_backward", [&] { - RoIPoolBackward( + grad.scalar_type(), "roi_pool_backward_cpu", [&] { + roi_pool_backward_kernel_impl( grad.data_ptr(), argmax.data_ptr(), num_rois, @@ -228,3 +234,6 @@ at::Tensor ROIPool_backward_cpu( }); return grad_input; } + +} // namespace ops +} // namespace vision diff --git a/torchvision/csrc/cpu/roi_pool_kernel.h b/torchvision/csrc/cpu/roi_pool_kernel.h new file mode 100644 index 00000000000..33d029cf31a --- /dev/null +++ b/torchvision/csrc/cpu/roi_pool_kernel.h @@ -0,0 +1,29 @@ +#pragma once + +#include +#include "../macros.h" + +namespace vision { +namespace ops { + +VISION_API std::tuple roi_pool_forward_cpu( + const at::Tensor& input, + const at::Tensor& rois, + double spatial_scale, + int64_t pooled_height, + int64_t pooled_width); + +VISION_API at::Tensor roi_pool_backward_cpu( + const at::Tensor& grad, + const at::Tensor& rois, + const at::Tensor& argmax, + double spatial_scale, + int64_t pooled_height, + int64_t pooled_width, + int64_t batch_size, + int64_t channels, + int64_t height, + int64_t width); + +} // namespace ops +} // namespace vision diff --git a/torchvision/csrc/cpu/video/register.cpp b/torchvision/csrc/cpu/video/register.cpp index a88615987bf..08902d427b0 100644 --- a/torchvision/csrc/cpu/video/register.cpp +++ b/torchvision/csrc/cpu/video/register.cpp @@ -1,6 +1,3 @@ -#ifndef REGISTER_H -#define REGISTER_H - #include "Video.h" namespace { @@ -15,4 +12,3 @@ static auto registerVideo = .def("next", &Video::Next); } // namespace -#endif diff --git a/torchvision/csrc/cpu/vision_cpu.h b/torchvision/csrc/cpu/vision_cpu.h deleted file mode 100644 index d5bfcc0de24..00000000000 --- a/torchvision/csrc/cpu/vision_cpu.h +++ /dev/null @@ -1,124 +0,0 @@ -#pragma once -#include -#include "../macros.h" - -VISION_API at::Tensor DeformConv2d_forward_cpu( - const at::Tensor& input, - const at::Tensor& weight, - const at::Tensor& offset, - const at::Tensor& mask, - const at::Tensor& bias, - int64_t stride_h, - int64_t stride_w, - int64_t pad_h, - int64_t pad_w, - int64_t dilation_h, - int64_t dilation_w, - int64_t groups, - int64_t deformable_groups, - bool use_mask); - -VISION_API std:: - tuple - DeformConv2d_backward_cpu( - const at::Tensor& grad_out, - const at::Tensor& input, - const at::Tensor& weight, - const at::Tensor& offset, - const at::Tensor& mask, - const at::Tensor& bias, - int64_t stride_h, - int64_t stride_w, - int64_t pad_h, - int64_t pad_w, - int64_t dilation_h, - int64_t dilation_w, - int64_t groups, - int64_t deformable_groups, - bool use_mask); - -VISION_API at::Tensor nms_cpu( - const at::Tensor& dets, - const at::Tensor& scores, - double iou_threshold); - -VISION_API std::tuple PSROIAlign_forward_cpu( - const at::Tensor& input, - const at::Tensor& rois, - double spatial_scale, - int64_t pooled_height, - int64_t pooled_width, - int64_t sampling_ratio); - -VISION_API at::Tensor PSROIAlign_backward_cpu( - const at::Tensor& grad, - const at::Tensor& rois, - const at::Tensor& channel_mapping, - double spatial_scale, - int64_t pooled_height, - int64_t pooled_width, - int64_t sampling_ratio, - int64_t batch_size, - int64_t channels, - int64_t height, - int64_t width); - -VISION_API std::tuple PSROIPool_forward_cpu( - const at::Tensor& input, - const at::Tensor& rois, - double spatial_scale, - int64_t pooled_height, - int64_t pooled_width); - -VISION_API at::Tensor PSROIPool_backward_cpu( - const at::Tensor& grad, - const at::Tensor& rois, - const at::Tensor& channel_mapping, - double spatial_scale, - int64_t pooled_height, - int64_t 
pooled_width, - int64_t batch_size, - int64_t channels, - int64_t height, - int64_t width); - -VISION_API at::Tensor ROIAlign_forward_cpu( - const at::Tensor& input, - const at::Tensor& rois, - double spatial_scale, - int64_t pooled_height, - int64_t pooled_width, - int64_t sampling_ratio, - bool aligned); - -VISION_API at::Tensor ROIAlign_backward_cpu( - const at::Tensor& grad, - const at::Tensor& rois, - double spatial_scale, - int64_t pooled_height, - int64_t pooled_width, - int64_t batch_size, - int64_t channels, - int64_t height, - int64_t width, - int64_t sampling_ratio, - bool aligned); - -VISION_API std::tuple ROIPool_forward_cpu( - const at::Tensor& input, - const at::Tensor& rois, - double spatial_scale, - int64_t pooled_height, - int64_t pooled_width); - -VISION_API at::Tensor ROIPool_backward_cpu( - const at::Tensor& grad, - const at::Tensor& rois, - const at::Tensor& argmax, - double spatial_scale, - int64_t pooled_height, - int64_t pooled_width, - int64_t batch_size, - int64_t channels, - int64_t height, - int64_t width); diff --git a/torchvision/csrc/cuda/cuda_helpers.h b/torchvision/csrc/cuda/cuda_helpers.h index a82e1c86f87..cec4a183899 100644 --- a/torchvision/csrc/cuda/cuda_helpers.h +++ b/torchvision/csrc/cuda/cuda_helpers.h @@ -1,5 +1,8 @@ #pragma once +namespace vision { +namespace ops { + #define CUDA_1D_KERNEL_LOOP(i, n) \ for (int i = (blockIdx.x * blockDim.x) + threadIdx.x; i < (n); \ i += (blockDim.x * gridDim.x)) @@ -8,3 +11,6 @@ template constexpr __host__ __device__ inline integer ceil_div(integer n, integer m) { return (n + m - 1) / m; } + +} // namespace ops +} // namespace vision diff --git a/torchvision/csrc/cuda/DeformConv_cuda.cu b/torchvision/csrc/cuda/deform_conv2d_kernel.cu similarity index 83% rename from torchvision/csrc/cuda/DeformConv_cuda.cu rename to torchvision/csrc/cuda/deform_conv2d_kernel.cu index 507532e7184..e530710863c 100644 --- a/torchvision/csrc/cuda/DeformConv_cuda.cu +++ b/torchvision/csrc/cuda/deform_conv2d_kernel.cu @@ -66,17 +66,17 @@ // modified from // https://github.com/open-mmlab/mmdetection/blob/master/mmdet/ops/dcn/src/deform_conv_cuda.cpp -#include -#include #include #include #include #include "cuda_helpers.h" +#include "deform_conv2d_kernel.h" -#include -#include -#include +namespace vision { +namespace ops { + +namespace { const int kMaxParallelImgs = 32; @@ -90,7 +90,9 @@ inline unsigned int GET_THREADS() { return 512; } -inline unsigned int GET_BLOCKS(const unsigned int THREADS, const unsigned int N) { +inline unsigned int GET_BLOCKS( + const unsigned int THREADS, + const unsigned int N) { unsigned int kMaxGridNum = at::cuda::getCurrentDeviceProperties()->maxGridSize[0]; return std::min(kMaxGridNum, (N + THREADS - 1) / THREADS); @@ -136,7 +138,7 @@ __device__ scalar_t bilinear_interpolate( } template -__global__ void deformable_im2col_gpu_kernel( +__global__ void deformable_im2col_kernel( int n, const scalar_t* input_ptr, const scalar_t* offset_ptr, @@ -149,8 +151,8 @@ __global__ void deformable_im2col_gpu_kernel( int pad_w, int stride_h, int stride_w, - int dil_h, - int dil_w, + int dilation_h, + int dilation_w, int batch_sz, int n_in_channels, int n_offset_grps, @@ -198,8 +200,10 @@ __global__ void deformable_im2col_gpu_kernel( offset_ptr[offset_idx * (out_h * out_w) + out_y * out_w + out_x]; const scalar_t offset_w = offset_ptr [(offset_idx + 1) * (out_h * out_w) + out_y * out_w + out_x]; - const scalar_t y = (out_y * stride_h - pad_h) + i * dil_h + offset_h; - const scalar_t x = (out_x * stride_w - pad_w) + j * 
dil_w + offset_w; + const scalar_t y = + (out_y * stride_h - pad_h) + i * dilation_h + offset_h; + const scalar_t x = + (out_x * stride_w - pad_w) + j * dilation_w + offset_w; *columns_ptr = mask_value * bilinear_interpolate(input_ptr, height, width, y, x); columns_ptr += batch_sz * out_h * out_w; @@ -208,7 +212,7 @@ __global__ void deformable_im2col_gpu_kernel( } } -static void deformable_im2col( +void deformable_im2col( const at::Tensor& input, const at::Tensor& data_offset, const at::Tensor& data_mask, @@ -221,8 +225,8 @@ static void deformable_im2col( int pad_w, int stride_h, int stride_w, - int dil_h, - int dil_w, + int dilation_h, + int dilation_w, int out_h, int out_w, int parallel_imgs, @@ -235,10 +239,8 @@ static void deformable_im2col( const unsigned int blocks = GET_BLOCKS(threads, num_kernels); AT_DISPATCH_FLOATING_TYPES_AND_HALF( - input.scalar_type(), "deformable_im2col_gpu", ([&] { - deformable_im2col_gpu_kernel<<< - blocks, - threads>>>( + input.scalar_type(), "deformable_im2col", ([&] { + deformable_im2col_kernel<<<blocks, threads>>>( num_kernels, input.data_ptr<scalar_t>(), data_offset.data_ptr<scalar_t>(), @@ -251,8 +253,8 @@ static void deformable_im2col( pad_w, stride_h, stride_w, - dil_h, - dil_w, + dilation_h, + dilation_w, parallel_imgs, n_in_channels, deformable_group, @@ -268,7 +270,7 @@ static void deformable_im2col( } } -static int get_greatest_divisor_below_bound(int n, int bound) { +int get_greatest_divisor_below_bound(int n, int bound) { for (int k = bound; k > 1; --k) { if (n % k == 0) { return k; @@ -277,217 +279,8 @@ static int get_greatest_divisor_below_bound(int n, int bound) { return 1; } -at::Tensor DeformConv2d_forward_cuda( - const at::Tensor& input_param, - const at::Tensor& weight_param, - const at::Tensor& offset_param, - const at::Tensor& mask_param, - const at::Tensor& bias_param, - int64_t stride_h, - int64_t stride_w, - int64_t pad_h, - int64_t pad_w, - int64_t dil_h, - int64_t dil_w, - int64_t n_weight_grps, - int64_t n_offset_grps, - bool use_mask) { - at::Tensor input = input_param.contiguous(); - at::Tensor offset = offset_param.contiguous(); - at::Tensor weight = weight_param.contiguous(); - at::Tensor mask = mask_param.contiguous(); - at::Tensor bias = bias_param.contiguous(); - - TORCH_CHECK(input.ndimension() == 4); - TORCH_CHECK(offset.ndimension() == 4); - TORCH_CHECK(!use_mask || mask.ndimension() == 4); - TORCH_CHECK(weight.ndimension() == 4); - TORCH_CHECK(input.is_cuda(), "input must be a CUDA tensor"); - - at::DeviceGuard guard(input.device()); - - int batch_sz = input.size(0); - int in_channels = input.size(1); - int in_h = input.size(2); - int in_w = input.size(3); - - int n_parallel_imgs = - get_greatest_divisor_below_bound(batch_sz, kMaxParallelImgs); - - int out_channels = weight.size(0); - int weight_h = weight.size(2); - int weight_w = weight.size(3); - - int ker_h = dil_h * (weight_h - 1) + 1; - int ker_w = dil_w * (weight_w - 1) + 1; - int out_h = ((in_h + 2 * pad_h - ker_h) / stride_h) + 1; - int out_w = ((in_w + 2 * pad_w - ker_w) / stride_w) + 1; - - TORCH_CHECK( - weight_h > 0 && weight_w > 0, - "weight_h: ", - weight_h, - " weight_w: ", - weight_w); - TORCH_CHECK( - stride_h > 0 && stride_w > 0, - "stride_h: ", - stride_h, - " stride_w: ", - stride_w); - TORCH_CHECK(pad_h >= 0 && pad_w >= 0, "pad_h: ", pad_h, " pad_w: ", pad_w); - TORCH_CHECK(dil_h > 0 && dil_w > 0, "dil_h: ", dil_h, " dil_w: ", dil_w); - - TORCH_CHECK(weight.size(1) * n_weight_grps == input.size(1)); - TORCH_CHECK(weight.size(0) % n_weight_grps == 0); - TORCH_CHECK( - (offset.size(1) 
== n_offset_grps * 2 * weight_h * weight_w), - "offset.shape[1] is not valid: got: ", - offset.size(1), - " expected: ", - n_offset_grps * 2 * weight_h * weight_w); - TORCH_CHECK( - (!use_mask || mask.size(1) == n_offset_grps * weight_h * weight_w), - "mask.shape[1] is not valid: got: ", - mask.size(1), - " expected: ", - n_offset_grps * weight_h * weight_w); - TORCH_CHECK(input.size(1) % n_offset_grps == 0); - - TORCH_CHECK( - (offset.size(0) == input.size(0)), "invalid batch size of offset"); - TORCH_CHECK( - (offset.size(2) == out_h && offset.size(3) == out_w), - "offset output dims: (", - offset.size(2), - ", ", - offset.size(3), - ") - ", - "computed output dims: (", - out_h, - ", ", - out_w, - ")"); - TORCH_CHECK((mask.size(0) == input.size(0)), "invalid batch size of mask"); - TORCH_CHECK( - (!use_mask || (mask.size(2) == out_h && mask.size(3) == out_w)), - "mask output dims: (", - mask.size(2), - ", ", - mask.size(3), - ") - ", - "computed output dims: (", - out_h, - ", ", - out_w, - ")"); - TORCH_CHECK( - out_h > 0 && out_w > 0, - "Calculated output size too small - out_h: ", - out_h, - " out_w: ", - out_w); - - auto out = at::zeros({batch_sz, out_channels, out_h, out_w}, input.options()); - if (batch_sz == 0) { - return out; - } - - // Separate batches into blocks - out = out.view({batch_sz / n_parallel_imgs, - n_parallel_imgs, - out_channels, - out_h, - out_w}); - input = input.view( - {batch_sz / n_parallel_imgs, n_parallel_imgs, in_channels, in_h, in_w}); - - offset = offset.view({batch_sz / n_parallel_imgs, - n_parallel_imgs, - n_offset_grps * 2 * weight_h * weight_w, - out_h, - out_w}); - - if (use_mask) { - mask = mask.view({batch_sz / n_parallel_imgs, - n_parallel_imgs, - n_offset_grps * weight_h * weight_w, - out_h, - out_w}); - } - - at::Tensor out_buf = at::zeros( - {batch_sz / n_parallel_imgs, - out_channels, - n_parallel_imgs * out_h, - out_w}, - out.options()); - - // Separate channels into convolution groups - out_buf = out_buf.view({out_buf.size(0), - n_weight_grps, - out_buf.size(1) / n_weight_grps, - out_buf.size(2), - out_buf.size(3)}); - weight = weight.view({n_weight_grps, - weight.size(0) / n_weight_grps, - weight.size(1), - weight.size(2), - weight.size(3)}); - - // Sample points and perform convolution - auto columns = at::zeros( - {in_channels * weight_h * weight_w, n_parallel_imgs * out_h * out_w}, - input.options()); - for (int b = 0; b < batch_sz / n_parallel_imgs; b++) { - deformable_im2col( - input[b], - offset[b], - mask[b], - in_channels, - in_h, - in_w, - weight_h, - weight_w, - pad_h, - pad_w, - stride_h, - stride_w, - dil_h, - dil_w, - out_h, - out_w, - n_parallel_imgs, - n_offset_grps, - use_mask, - columns); - - columns = columns.view( - {n_weight_grps, columns.size(0) / n_weight_grps, columns.size(1)}); - for (int g = 0; g < n_weight_grps; g++) { - out_buf[b][g] = out_buf[b][g] - .flatten(1) - .addmm_(weight[g].flatten(1), columns[g]) - .view_as(out_buf[b][g]); - } - columns = - columns.view({columns.size(0) * columns.size(1), columns.size(2)}); - } - - out_buf = out_buf.view({batch_sz / n_parallel_imgs, - out_channels, - n_parallel_imgs, - out_h, - out_w}); - out_buf.transpose_(1, 2); - out.copy_(out_buf); - out = out.view({batch_sz, out_channels, out_h, out_w}); - - return out + bias.view({1, out_channels, 1, 1}); -} - template -__global__ void deformable_col2im_gpu_kernel( +__global__ void deformable_col2im_kernel( int n, const scalar_t* col, const scalar_t* offset_ptr, @@ -560,7 +353,7 @@ __global__ void deformable_col2im_gpu_kernel( } } 
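The dispatch blocks below launch the renamed kernels with an explicit `<<<blocks, threads>>>` configuration, where `threads` comes from `GET_THREADS()` and `blocks` from `GET_BLOCKS()` earlier in this file, both built on the `ceil_div` helper from `cuda_helpers.h`. A minimal host-side C++ sketch of that launch arithmetic, assuming the 512-thread value visible in this diff and a hard-coded grid cap for illustration (the real code queries `at::cuda::getCurrentDeviceProperties()->maxGridSize[0]`):

```cpp
#include <algorithm>
#include <cstdio>

// Mirrors ceil_div from torchvision/csrc/cuda/cuda_helpers.h: round up n/m.
template <typename integer>
constexpr integer ceil_div(integer n, integer m) {
  return (n + m - 1) / m;
}

// Sketch of GET_BLOCKS: one thread per work item, with the grid clamped
// to the device limit (2^31 - 1 is an assumed value, for illustration only).
unsigned int get_blocks(unsigned int threads, unsigned int n) {
  const unsigned int kMaxGridNum = 2147483647u; // assumed maxGridSize[0]
  return std::min(kMaxGridNum, ceil_div(n, threads));
}

int main() {
  const unsigned int num_kernels = 3 * 64 * 28 * 28; // one item per output element
  const unsigned int threads = 512; // value returned by GET_THREADS() in this diff
  std::printf("launch config: <<<%u, %u>>>\n",
              get_blocks(threads, num_kernels), threads);
  return 0;
}
```

When the grid is clamped, correctness is preserved by `CUDA_1D_KERNEL_LOOP` from `cuda_helpers.h`: each thread then strides over multiple elements instead of assuming one element per thread.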
-static void compute_grad_input( +void compute_grad_input( const at::Tensor& columns, const at::Tensor& offset, const at::Tensor& mask, @@ -590,10 +383,8 @@ static void compute_grad_input( const unsigned int blocks = GET_BLOCKS(threads, num_kernels); AT_DISPATCH_FLOATING_TYPES_AND_HALF( - columns.scalar_type(), "deformable_col2im_gpu", ([&] { - deformable_col2im_gpu_kernel<<< - blocks, - threads>>>( + columns.scalar_type(), "compute_grad_input", ([&] { + deformable_col2im_kernel<<<blocks, threads>>>( num_kernels, columns.data_ptr<scalar_t>(), offset.data_ptr<scalar_t>(), @@ -657,7 +448,7 @@ __device__ scalar_t get_coordinate_weight( } template <typename scalar_t> -__global__ void deformable_col2im_coord_gpu_kernel( +__global__ void deformable_col2im_coord_kernel( int n, const scalar_t* col_ptr, const scalar_t* im_ptr, @@ -766,7 +557,7 @@ __global__ void deformable_col2im_coord_gpu_kernel( } } -static void compute_grad_offset_and_mask( +void compute_grad_offset_and_mask( const at::Tensor& columns, const at::Tensor& input, const at::Tensor& offset, @@ -798,10 +589,8 @@ static void compute_grad_offset_and_mask( const unsigned int blocks = GET_BLOCKS(threads, num_kernels); AT_DISPATCH_FLOATING_TYPES_AND_HALF( - columns.scalar_type(), "deformable_col2im_coord_gpu", ([&] { - deformable_col2im_coord_gpu_kernel<<< - blocks, - threads>>>( + columns.scalar_type(), "compute_grad_offset_and_mask", ([&] { + deformable_col2im_coord_kernel<<<blocks, threads>>>( num_kernels, columns.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(), @@ -835,7 +624,7 @@ static void compute_grad_offset_and_mask( } } -static std::tuple<at::Tensor, at::Tensor, at::Tensor> deform_conv2d_backward_input_cuda( +std::tuple<at::Tensor, at::Tensor, at::Tensor> backward_gradient_inputs( at::Tensor input, at::Tensor weight, at::Tensor offset, @@ -845,8 +634,8 @@ static std::tuple<at::Tensor, at::Tensor, at::Tensor> deform_conv2d_backward_inp int stride_w, int pad_h, int pad_w, - int dil_h, - int dil_w, + int dilation_h, + int dilation_w, int n_weight_grps, int n_offset_grps, int n_parallel_imgs, @@ -864,8 +653,10 @@ static std::tuple<at::Tensor, at::Tensor, at::Tensor> deform_conv2d_backward_inp int weight_h = weight.size(2); int weight_w = weight.size(3); - long out_w = (in_w + 2 * pad_w - (dil_w * (weight_w - 1) + 1)) / stride_w + 1; - long out_h = (in_h + 2 * pad_h - (dil_h * (weight_h - 1) + 1)) / stride_h + 1; + long out_w = + (in_w + 2 * pad_w - (dilation_w * (weight_w - 1) + 1)) / stride_w + 1; + long out_h = + (in_h + 2 * pad_h - (dilation_h * (weight_h - 1) + 1)) / stride_h + 1; auto grad_input = at::zeros_like(input); auto grad_offset = at::zeros_like(offset); @@ -948,8 +739,8 @@ static std::tuple<at::Tensor, at::Tensor, at::Tensor> deform_conv2d_backward_inp pad_w, stride_h, stride_w, - dil_h, - dil_w, + dilation_h, + dilation_w, n_parallel_imgs, n_offset_grps, use_mask, @@ -969,8 +760,8 @@ static std::tuple<at::Tensor, at::Tensor, at::Tensor> deform_conv2d_backward_inp pad_w, stride_h, stride_w, - dil_h, - dil_w, + dilation_h, + dilation_w, n_parallel_imgs, n_offset_grps, use_mask, @@ -989,7 +780,7 @@ static std::tuple<at::Tensor, at::Tensor, at::Tensor> deform_conv2d_backward_inp return std::make_tuple(grad_input, grad_offset, grad_mask); } -static at::Tensor deform_conv2d_backward_parameters_cuda( +at::Tensor backward_gradient_parameters( at::Tensor input, const at::Tensor& weight, at::Tensor offset, @@ -999,8 +790,8 @@ static at::Tensor deform_conv2d_backward_parameters_cuda( int stride_w, int pad_h, int pad_w, - int dil_h, - int dil_w, + int dilation_h, + int dilation_w, int n_weight_grps, int n_offset_grps, int n_parallel_imgs, @@ -1079,8 +870,8 @@ static at::Tensor deform_conv2d_backward_parameters_cuda( pad_w, stride_h, stride_w, - dil_h, - dil_w, + dilation_h, + dilation_w, out_h, out_w, n_parallel_imgs, @@ -1105,46 +896,264 @@ static at::Tensor 
deform_conv2d_backward_parameters_cuda( return grad_weight; } +} // namespace + +at::Tensor deform_conv2d_forward_cuda( + const at::Tensor& input, + const at::Tensor& weight, + const at::Tensor& offset, + const at::Tensor& mask, + const at::Tensor& bias, + int64_t stride_h, + int64_t stride_w, + int64_t pad_h, + int64_t pad_w, + int64_t dilation_h, + int64_t dilation_w, + int64_t n_weight_grps, + int64_t n_offset_grps, + bool use_mask) { + at::Tensor input_c = input.contiguous(); + at::Tensor offset_c = offset.contiguous(); + at::Tensor weight_c = weight.contiguous(); + at::Tensor mask_c = mask.contiguous(); + at::Tensor bias_c = bias.contiguous(); + + TORCH_CHECK(input_c.ndimension() == 4); + TORCH_CHECK(offset_c.ndimension() == 4); + TORCH_CHECK(!use_mask || mask_c.ndimension() == 4); + TORCH_CHECK(weight_c.ndimension() == 4); + TORCH_CHECK(input_c.is_cuda(), "input must be a CUDA tensor"); + + at::DeviceGuard guard(input_c.device()); + + int batch_sz = input_c.size(0); + int in_channels = input_c.size(1); + int in_h = input_c.size(2); + int in_w = input_c.size(3); + + int n_parallel_imgs = + get_greatest_divisor_below_bound(batch_sz, kMaxParallelImgs); + + int out_channels = weight_c.size(0); + int weight_h = weight_c.size(2); + int weight_w = weight_c.size(3); + + int ker_h = dilation_h * (weight_h - 1) + 1; + int ker_w = dilation_w * (weight_w - 1) + 1; + int out_h = ((in_h + 2 * pad_h - ker_h) / stride_h) + 1; + int out_w = ((in_w + 2 * pad_w - ker_w) / stride_w) + 1; + + TORCH_CHECK( + weight_h > 0 && weight_w > 0, + "weight_h: ", + weight_h, + " weight_w: ", + weight_w); + TORCH_CHECK( + stride_h > 0 && stride_w > 0, + "stride_h: ", + stride_h, + " stride_w: ", + stride_w); + TORCH_CHECK(pad_h >= 0 && pad_w >= 0, "pad_h: ", pad_h, " pad_w: ", pad_w); + TORCH_CHECK( + dilation_h > 0 && dilation_w > 0, + "dilation_h: ", + dilation_h, + " dilation_w: ", + dilation_w); + + TORCH_CHECK(weight_c.size(1) * n_weight_grps == input_c.size(1)); + TORCH_CHECK(weight_c.size(0) % n_weight_grps == 0); + TORCH_CHECK( + (offset_c.size(1) == n_offset_grps * 2 * weight_h * weight_w), + "offset.shape[1] is not valid: got: ", + offset_c.size(1), + " expected: ", + n_offset_grps * 2 * weight_h * weight_w); + TORCH_CHECK( + (!use_mask || mask_c.size(1) == n_offset_grps * weight_h * weight_w), + "mask.shape[1] is not valid: got: ", + mask_c.size(1), + " expected: ", + n_offset_grps * weight_h * weight_w); + TORCH_CHECK(input_c.size(1) % n_offset_grps == 0); + + TORCH_CHECK( + (offset_c.size(0) == input_c.size(0)), "invalid batch size of offset"); + TORCH_CHECK( + (offset_c.size(2) == out_h && offset_c.size(3) == out_w), + "offset output dims: (", + offset_c.size(2), + ", ", + offset_c.size(3), + ") - ", + "computed output dims: (", + out_h, + ", ", + out_w, + ")"); + TORCH_CHECK( + (mask_c.size(0) == input_c.size(0)), "invalid batch size of mask"); + TORCH_CHECK( + (!use_mask || (mask_c.size(2) == out_h && mask_c.size(3) == out_w)), + "mask output dims: (", + mask_c.size(2), + ", ", + mask_c.size(3), + ") - ", + "computed output dims: (", + out_h, + ", ", + out_w, + ")"); + TORCH_CHECK( + out_h > 0 && out_w > 0, + "Calculated output size too small - out_h: ", + out_h, + " out_w: ", + out_w); + + auto out = + at::zeros({batch_sz, out_channels, out_h, out_w}, input_c.options()); + if (batch_sz == 0) { + return out; + } + + // Separate batches into blocks + out = out.view({batch_sz / n_parallel_imgs, + n_parallel_imgs, + out_channels, + out_h, + out_w}); + input_c = input_c.view( + {batch_sz / 
n_parallel_imgs, n_parallel_imgs, in_channels, in_h, in_w}); + + offset_c = offset_c.view({batch_sz / n_parallel_imgs, + n_parallel_imgs, + n_offset_grps * 2 * weight_h * weight_w, + out_h, + out_w}); + + if (use_mask) { + mask_c = mask_c.view({batch_sz / n_parallel_imgs, + n_parallel_imgs, + n_offset_grps * weight_h * weight_w, + out_h, + out_w}); + } + + at::Tensor out_buf = at::zeros( + {batch_sz / n_parallel_imgs, + out_channels, + n_parallel_imgs * out_h, + out_w}, + out.options()); + + // Separate channels into convolution groups + out_buf = out_buf.view({out_buf.size(0), + n_weight_grps, + out_buf.size(1) / n_weight_grps, + out_buf.size(2), + out_buf.size(3)}); + weight_c = weight_c.view({n_weight_grps, + weight_c.size(0) / n_weight_grps, + weight_c.size(1), + weight_c.size(2), + weight_c.size(3)}); + + // Sample points and perform convolution + auto columns = at::zeros( + {in_channels * weight_h * weight_w, n_parallel_imgs * out_h * out_w}, + input_c.options()); + for (int b = 0; b < batch_sz / n_parallel_imgs; b++) { + deformable_im2col( + input_c[b], + offset_c[b], + mask_c[b], + in_channels, + in_h, + in_w, + weight_h, + weight_w, + pad_h, + pad_w, + stride_h, + stride_w, + dilation_h, + dilation_w, + out_h, + out_w, + n_parallel_imgs, + n_offset_grps, + use_mask, + columns); + + columns = columns.view( + {n_weight_grps, columns.size(0) / n_weight_grps, columns.size(1)}); + for (int g = 0; g < n_weight_grps; g++) { + out_buf[b][g] = out_buf[b][g] + .flatten(1) + .addmm_(weight_c[g].flatten(1), columns[g]) + .view_as(out_buf[b][g]); + } + columns = + columns.view({columns.size(0) * columns.size(1), columns.size(2)}); + } + + out_buf = out_buf.view({batch_sz / n_parallel_imgs, + out_channels, + n_parallel_imgs, + out_h, + out_w}); + out_buf.transpose_(1, 2); + out.copy_(out_buf); + out = out.view({batch_sz, out_channels, out_h, out_w}); + + return out + bias_c.view({1, out_channels, 1, 1}); +} + std::tuple -DeformConv2d_backward_cuda( - const at::Tensor& grad_out_param, - const at::Tensor& input_param, - const at::Tensor& weight_param, - const at::Tensor& offset_param, - const at::Tensor& mask_param, - const at::Tensor& bias_param, +deform_conv2d_backward_cuda( + const at::Tensor& grad_out, + const at::Tensor& input, + const at::Tensor& weight, + const at::Tensor& offset, + const at::Tensor& mask, + const at::Tensor& bias, int64_t stride_h, int64_t stride_w, int64_t pad_h, int64_t pad_w, - int64_t dil_h, - int64_t dil_w, + int64_t dilation_h, + int64_t dilation_w, int64_t n_weight_grps, int64_t n_offset_grps, bool use_mask) { - at::Tensor grad_out = grad_out_param.contiguous(); - at::Tensor input = input_param.contiguous(); - at::Tensor weight = weight_param.contiguous(); - at::Tensor offset = offset_param.contiguous(); - at::Tensor mask = mask_param.contiguous(); - at::Tensor bias = bias_param.contiguous(); - - const int batch_sz = input.size(0); + at::Tensor grad_out_c = grad_out.contiguous(); + at::Tensor input_c = input.contiguous(); + at::Tensor weight_c = weight.contiguous(); + at::Tensor offset_c = offset.contiguous(); + at::Tensor mask_c = mask.contiguous(); + at::Tensor bias_c = bias.contiguous(); + + const int batch_sz = input_c.size(0); const int n_parallel_imgs = get_greatest_divisor_below_bound(batch_sz, kMaxParallelImgs); - auto grad_input_and_offset_and_mask = deform_conv2d_backward_input_cuda( - input, - weight, - offset, - mask, - grad_out, + auto grad_input_and_offset_and_mask = backward_gradient_inputs( + input_c, + weight_c, + offset_c, + mask_c, + grad_out_c, 
stride_h, stride_w, pad_h, pad_w, - dil_h, - dil_w, + dilation_h, + dilation_w, n_weight_grps, n_offset_grps, n_parallel_imgs, @@ -1154,26 +1163,29 @@ DeformConv2d_backward_cuda( auto grad_offset = std::get<1>(grad_input_and_offset_and_mask); auto grad_mask = std::get<2>(grad_input_and_offset_and_mask); - auto grad_weight = deform_conv2d_backward_parameters_cuda( - input, - weight, - offset, - mask, - grad_out, + auto grad_weight = backward_gradient_parameters( + input_c, + weight_c, + offset_c, + mask_c, + grad_out_c, stride_h, stride_w, pad_h, pad_w, - dil_h, - dil_w, + dilation_h, + dilation_w, n_weight_grps, n_offset_grps, n_parallel_imgs, use_mask); - auto value = grad_out.sum({0, 2, 3}); - auto grad_bias = at::ones_like(bias) * value; + auto value = grad_out_c.sum({0, 2, 3}); + auto grad_bias = at::ones_like(bias_c) * value; return std::make_tuple( grad_input, grad_weight, grad_offset, grad_mask, grad_bias); } + +} // namespace ops +} // namespace vision diff --git a/torchvision/csrc/cuda/deform_conv2d_kernel.h b/torchvision/csrc/cuda/deform_conv2d_kernel.h new file mode 100644 index 00000000000..b2e3dc3f17f --- /dev/null +++ b/torchvision/csrc/cuda/deform_conv2d_kernel.h @@ -0,0 +1,45 @@ +#pragma once + +#include +#include "../macros.h" + +namespace vision { +namespace ops { + +VISION_API at::Tensor deform_conv2d_forward_cuda( + const at::Tensor& input, + const at::Tensor& weight, + const at::Tensor& offset, + const at::Tensor& mask, + const at::Tensor& bias, + int64_t stride_h, + int64_t stride_w, + int64_t pad_h, + int64_t pad_w, + int64_t dilation_h, + int64_t dilation_w, + int64_t n_weight_grps, + int64_t n_offset_grps, + bool use_mask); + +VISION_API std:: + tuple + deform_conv2d_backward_cuda( + const at::Tensor& grad_out, + const at::Tensor& input, + const at::Tensor& weight, + const at::Tensor& offset, + const at::Tensor& mask, + const at::Tensor& bias, + int64_t stride_h, + int64_t stride_w, + int64_t pad_h, + int64_t pad_w, + int64_t dilation_h, + int64_t dilation_w, + int64_t n_weight_grps, + int64_t n_offset_grps, + bool use_mask); + +} // namespace ops +} // namespace vision diff --git a/torchvision/csrc/cuda/nms_cuda.cu b/torchvision/csrc/cuda/nms_kernel.cu similarity index 88% rename from torchvision/csrc/cuda/nms_cuda.cu rename to torchvision/csrc/cuda/nms_kernel.cu index 548dc2f69cb..b8d4b3ce0ec 100644 --- a/torchvision/csrc/cuda/nms_cuda.cu +++ b/torchvision/csrc/cuda/nms_kernel.cu @@ -1,16 +1,21 @@ -#include #include #include #include "cuda_helpers.h" +#include "nms_kernel.h" -#include -#include +namespace vision { +namespace ops { + +namespace { int const threadsPerBlock = sizeof(unsigned long long) * 8; template -__device__ inline bool devIoU(T const* const a, T const* const b, const float threshold) { +__device__ inline bool devIoU( + T const* const a, + T const* const b, + const float threshold) { T left = max(a[0], b[0]), right = min(a[2], b[2]); T top = max(a[1], b[1]), bottom = min(a[3], b[3]); T width = max(right - left, (T)0), height = max(bottom - top, (T)0); @@ -21,7 +26,7 @@ __device__ inline bool devIoU(T const* const a, T const* const b, const float th } template -__global__ void nms_kernel( +__global__ void nms_kernel_impl( int n_boxes, double iou_threshold, const T* dev_boxes, @@ -29,7 +34,8 @@ __global__ void nms_kernel( const int row_start = blockIdx.y; const int col_start = blockIdx.x; - if (row_start > col_start) return; + if (row_start > col_start) + return; const int row_size = min(n_boxes - row_start * threadsPerBlock, threadsPerBlock); @@ 
-68,7 +74,10 @@ __global__ void nms_kernel( } } -at::Tensor nms_cuda(const at::Tensor& dets, +} // namespace + +at::Tensor nms_cuda( + const at::Tensor& dets, const at::Tensor& scores, double iou_threshold) { TORCH_CHECK(dets.is_cuda(), "dets must be a CUDA tensor"); @@ -118,8 +127,8 @@ at::Tensor nms_cuda(const at::Tensor& dets, cudaStream_t stream = at::cuda::getCurrentCUDAStream(); AT_DISPATCH_FLOATING_TYPES_AND_HALF( - dets_sorted.scalar_type(), "nms_kernel_cuda", [&] { - nms_kernel<<>>( + dets_sorted.scalar_type(), "nms_cuda", [&] { + nms_kernel_impl<<>>( dets_num, iou_threshold, dets_sorted.data_ptr(), @@ -127,7 +136,8 @@ at::Tensor nms_cuda(const at::Tensor& dets, }); at::Tensor mask_cpu = mask.to(at::kCPU); - unsigned long long* mask_host = (unsigned long long*)mask_cpu.data_ptr(); + unsigned long long* mask_host = + (unsigned long long*)mask_cpu.data_ptr(); std::vector remv(col_blocks); memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks); @@ -155,3 +165,6 @@ at::Tensor nms_cuda(const at::Tensor& dets, {keep.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep) .to(order_t.device(), keep.scalar_type())}); } + +} // namespace ops +} // namespace vision diff --git a/torchvision/csrc/cuda/nms_kernel.h b/torchvision/csrc/cuda/nms_kernel.h new file mode 100644 index 00000000000..0d2c0838437 --- /dev/null +++ b/torchvision/csrc/cuda/nms_kernel.h @@ -0,0 +1,15 @@ +#pragma once + +#include +#include "../macros.h" + +namespace vision { +namespace ops { + +VISION_API at::Tensor nms_cuda( + const at::Tensor& dets, + const at::Tensor& scores, + double iou_threshold); + +} // namespace ops +} // namespace vision diff --git a/torchvision/csrc/cuda/PSROIAlign_cuda.cu b/torchvision/csrc/cuda/ps_roi_align_kernel.cu similarity index 91% rename from torchvision/csrc/cuda/PSROIAlign_cuda.cu rename to torchvision/csrc/cuda/ps_roi_align_kernel.cu index e6912d8c7ee..6b1e729b12d 100644 --- a/torchvision/csrc/cuda/PSROIAlign_cuda.cu +++ b/torchvision/csrc/cuda/ps_roi_align_kernel.cu @@ -1,11 +1,14 @@ -#include -#include #include #include #include -#include #include "cuda_helpers.h" +#include "ps_roi_align_kernel.h" + +namespace vision { +namespace ops { + +namespace { template __device__ T bilinear_interpolate( @@ -62,7 +65,7 @@ __device__ T bilinear_interpolate( } template -__global__ void PSROIAlignForwardCUDA( +__global__ void ps_roi_align_forward_kernel_impl( int nthreads, const T* input, const T spatial_scale, @@ -195,7 +198,7 @@ __device__ void bilinear_interpolate_gradient( } template -__global__ void PSROIAlignBackwardCUDA( +__global__ void ps_roi_align_backward_kernel_impl( int nthreads, const T* grad_output, const int* channel_mapping, @@ -292,7 +295,9 @@ __global__ void PSROIAlignBackwardCUDA( } } -std::tuple PSROIAlign_forward_cuda( +} // namespace + +std::tuple ps_roi_align_forward_cuda( const at::Tensor& input, const at::Tensor& rois, double spatial_scale, @@ -307,7 +312,7 @@ std::tuple PSROIAlign_forward_cuda( at::TensorArg input_t{input, "input", 1}, rois_t{rois, "rois", 2}; - at::CheckedFrom c = "PSROIAlign_forward_cuda"; + at::CheckedFrom c = "ps_roi_align_forward_cuda"; at::checkAllSameGPU(c, {input_t, rois_t}); at::checkAllSameType(c, {input_t, rois_t}); @@ -337,15 +342,14 @@ std::tuple PSROIAlign_forward_cuda( cudaStream_t stream = at::cuda::getCurrentCUDAStream(); dim3 grid(std::min( - ceil_div(static_cast(output_size), static_cast(512)), + ceil_div(static_cast(output_size), static_cast(512)), static_cast(4096))); dim3 block(512); - auto input_ = input.contiguous(), - rois_ 
= rois.contiguous(); + auto input_ = input.contiguous(), rois_ = rois.contiguous(); AT_DISPATCH_FLOATING_TYPES_AND_HALF( - input.scalar_type(), "PSROIAlign_forward", [&] { - PSROIAlignForwardCUDA<<>>( + input.scalar_type(), "ps_roi_align_forward_cuda", [&] { + ps_roi_align_forward_kernel_impl<<>>( output_size, input_.data_ptr(), spatial_scale, @@ -365,7 +369,7 @@ std::tuple PSROIAlign_forward_cuda( return std::make_tuple(output, channel_mapping); } -at::Tensor PSROIAlign_backward_cuda( +at::Tensor ps_roi_align_backward_cuda( const at::Tensor& grad, const at::Tensor& rois, const at::Tensor& channel_mapping, @@ -381,13 +385,12 @@ at::Tensor PSROIAlign_backward_cuda( TORCH_CHECK(grad.is_cuda(), "grad must be a CUDA tensor"); TORCH_CHECK(rois.is_cuda(), "rois must be a CUDA tensor"); TORCH_CHECK( - channel_mapping.is_cuda(), - "channel_mapping must be a CUDA tensor"); + channel_mapping.is_cuda(), "channel_mapping must be a CUDA tensor"); at::TensorArg grad_t{grad, "grad", 1}, rois_t{rois, "rois", 2}, channel_mapping_t{channel_mapping, "channel_mapping", 3}; - at::CheckedFrom c = "PSROIAlign_backward_cuda"; + at::CheckedFrom c = "ps_roi_align_backward_cuda"; at::checkAllSameGPU(c, {grad_t, rois_t, channel_mapping_t}); at::checkAllSameType(c, {grad_t, rois_t}); @@ -400,7 +403,7 @@ at::Tensor PSROIAlign_backward_cuda( cudaStream_t stream = at::cuda::getCurrentCUDAStream(); dim3 grid(std::min( - ceil_div(static_cast(grad.numel()), static_cast(512)), + ceil_div(static_cast(grad.numel()), static_cast(512)), static_cast(4096))); dim3 block(512); @@ -412,11 +415,10 @@ at::Tensor PSROIAlign_backward_cuda( int channels_out = channels / (pooled_height * pooled_width); - auto grad_ = grad.contiguous(), - rois_ = rois.contiguous(); + auto grad_ = grad.contiguous(), rois_ = rois.contiguous(); AT_DISPATCH_FLOATING_TYPES_AND_HALF( - grad.scalar_type(), "PSROIAlign_backward", [&] { - PSROIAlignBackwardCUDA<<>>( + grad.scalar_type(), "ps_roi_align_backward_cuda", [&] { + ps_roi_align_backward_kernel_impl<<>>( grad.numel(), grad_.data_ptr(), channel_mapping.data_ptr(), @@ -435,3 +437,6 @@ at::Tensor PSROIAlign_backward_cuda( AT_CUDA_CHECK(cudaGetLastError()); return grad_input; } + +} // namespace ops +} // namespace vision diff --git a/torchvision/csrc/cuda/ps_roi_align_kernel.h b/torchvision/csrc/cuda/ps_roi_align_kernel.h new file mode 100644 index 00000000000..c40e6fa55b1 --- /dev/null +++ b/torchvision/csrc/cuda/ps_roi_align_kernel.h @@ -0,0 +1,31 @@ +#pragma once + +#include +#include "../macros.h" + +namespace vision { +namespace ops { + +VISION_API std::tuple ps_roi_align_forward_cuda( + const at::Tensor& input, + const at::Tensor& rois, + double spatial_scale, + int64_t pooled_height, + int64_t pooled_width, + int64_t sampling_ratio); + +VISION_API at::Tensor ps_roi_align_backward_cuda( + const at::Tensor& grad, + const at::Tensor& rois, + const at::Tensor& channel_mapping, + double spatial_scale, + int64_t pooled_height, + int64_t pooled_width, + int64_t sampling_ratio, + int64_t batch_size, + int64_t channels, + int64_t height, + int64_t width); + +} // namespace ops +} // namespace vision diff --git a/torchvision/csrc/cuda/PSROIPool_cuda.cu b/torchvision/csrc/cuda/ps_roi_pool_kernel.cu similarity index 88% rename from torchvision/csrc/cuda/PSROIPool_cuda.cu rename to torchvision/csrc/cuda/ps_roi_pool_kernel.cu index ab6a50b009c..91fd25b4bb5 100644 --- a/torchvision/csrc/cuda/PSROIPool_cuda.cu +++ b/torchvision/csrc/cuda/ps_roi_pool_kernel.cu @@ -1,13 +1,17 @@ -#include -#include #include #include 
#include #include "cuda_helpers.h" +#include "ps_roi_pool_kernel.h" + +namespace vision { +namespace ops { + +namespace { template -__global__ void PSROIPoolForward( +__global__ void ps_roi_pool_forward_kernel_impl( int nthreads, const T* input, const T spatial_scale, @@ -73,7 +77,7 @@ __global__ void PSROIPoolForward( } template -__global__ void PSROIPoolBackward( +__global__ void ps_roi_pool_backward_kernel_impl( int nthreads, const T* grad_output, const int* channel_mapping, @@ -132,7 +136,9 @@ __global__ void PSROIPoolBackward( } } -std::tuple PSROIPool_forward_cuda( +} // namespace + +std::tuple ps_roi_pool_forward_cuda( const at::Tensor& input, const at::Tensor& rois, double spatial_scale, @@ -146,7 +152,7 @@ std::tuple PSROIPool_forward_cuda( at::TensorArg input_t{input, "input", 1}, rois_t{rois, "rois", 2}; - at::CheckedFrom c = "PSROIPool_forward_cuda"; + at::CheckedFrom c = "ps_roi_pool_forward_cuda"; at::checkAllSameGPU(c, {input_t, rois_t}); at::checkAllSameType(c, {input_t, rois_t}); @@ -176,15 +182,14 @@ std::tuple PSROIPool_forward_cuda( cudaStream_t stream = at::cuda::getCurrentCUDAStream(); dim3 grid(std::min( - ceil_div(static_cast(output_size), static_cast(512)), + ceil_div(static_cast(output_size), static_cast(512)), static_cast(4096))); dim3 block(512); - auto input_ = input.contiguous(), - rois_ = rois.contiguous(); + auto input_ = input.contiguous(), rois_ = rois.contiguous(); AT_DISPATCH_FLOATING_TYPES_AND_HALF( - input.scalar_type(), "PSROIPool_forward", [&] { - PSROIPoolForward<<>>( + input.scalar_type(), "ps_roi_pool_forward_cuda", [&] { + ps_roi_pool_forward_kernel_impl<<>>( output_size, input_.data_ptr(), spatial_scale, @@ -202,7 +207,7 @@ std::tuple PSROIPool_forward_cuda( return std::make_tuple(output, channel_mapping); } -at::Tensor PSROIPool_backward_cuda( +at::Tensor ps_roi_pool_backward_cuda( const at::Tensor& grad, const at::Tensor& rois, const at::Tensor& channel_mapping, @@ -217,13 +222,12 @@ at::Tensor PSROIPool_backward_cuda( TORCH_CHECK(grad.is_cuda(), "grad must be a CUDA tensor"); TORCH_CHECK(rois.is_cuda(), "rois must be a CUDA tensor"); TORCH_CHECK( - channel_mapping.is_cuda(), - "channel_mapping must be a CUDA tensor"); + channel_mapping.is_cuda(), "channel_mapping must be a CUDA tensor"); at::TensorArg grad_t{grad, "grad", 1}, rois_t{rois, "rois", 2}, channel_mapping_t{channel_mapping, "channel_mapping", 3}; - at::CheckedFrom c = "PSROIPool_backward_cuda"; + at::CheckedFrom c = "ps_roi_pool_backward_cuda"; at::checkAllSameGPU(c, {grad_t, rois_t, channel_mapping_t}); at::checkAllSameType(c, {grad_t, rois_t}); @@ -236,7 +240,7 @@ at::Tensor PSROIPool_backward_cuda( cudaStream_t stream = at::cuda::getCurrentCUDAStream(); dim3 grid(std::min( - ceil_div(static_cast(grad.numel()), static_cast(512)), + ceil_div(static_cast(grad.numel()), static_cast(512)), static_cast(4096))); dim3 block(512); @@ -248,11 +252,10 @@ at::Tensor PSROIPool_backward_cuda( int channels_out = channels / (pooled_height * pooled_width); - auto grad_ = grad.contiguous(), - rois_ = rois.contiguous(); + auto grad_ = grad.contiguous(), rois_ = rois.contiguous(); AT_DISPATCH_FLOATING_TYPES_AND_HALF( - grad.scalar_type(), "PSROIPool_backward", [&] { - PSROIPoolBackward<<>>( + grad.scalar_type(), "ps_roi_pool_backward_cuda", [&] { + ps_roi_pool_backward_kernel_impl<<>>( grad.numel(), grad_.data_ptr(), channel_mapping.data_ptr(), @@ -270,3 +273,6 @@ at::Tensor PSROIPool_backward_cuda( AT_CUDA_CHECK(cudaGetLastError()); return grad_input; } + +} // namespace ops +} // namespace vision 
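The same mechanical transformation repeats for every CUDA op file in this patch: device code and its helpers move into an anonymous namespace (which already gives them internal linkage, hence the dropped `static` keywords), kernels are renamed `*_kernel_impl`, the public entry points become snake_case `*_cuda` functions inside `vision::ops`, and each op gets its own small `VISION_API` header in place of the monolithic vision_cuda.h. Below is a minimal sketch of the resulting file layout with hypothetical identifiers (`example_*`), not a verbatim copy of any file above; note the triple-chevron launch syntax, which the copies above render garbled as `<<>>`:

#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include "cuda_helpers.h"
#include "example_kernel.h" // hypothetical per-op header, analogous to nms_kernel.h

namespace vision {
namespace ops {

namespace { // anonymous namespace: implementation details stay local to this TU

template <typename T>
__global__ void example_kernel_impl(int64_t n, const T* in, T* out) {
  CUDA_1D_KERNEL_LOOP(i, n) { // grid-stride loop macro from cuda_helpers.h
    out[i] = in[i];
  }
}

} // namespace

at::Tensor example_cuda(const at::Tensor& input) {
  TORCH_CHECK(input.is_cuda(), "input must be a CUDA tensor");
  at::DeviceGuard guard(input.device()); // same guard pattern as deform_conv2d_forward_cuda
  auto input_ = input.contiguous();
  auto output = at::empty_like(input_);
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      input_.scalar_type(), "example_cuda", [&] {
        // fixed launch geometry for brevity; the real ops size the grid
        // with ceil_div(output_size, 512) capped at 4096 blocks
        example_kernel_impl<scalar_t><<<4096, 512, 0, stream>>>(
            input_.numel(),
            input_.data_ptr<scalar_t>(),
            output.data_ptr<scalar_t>());
      });
  AT_CUDA_CHECK(cudaGetLastError());
  return output;
}

} // namespace ops
} // namespace vision

Keeping only the `*_cuda` symbols visible (and exporting them via `VISION_API`) is what lets the install rules above ship the new per-op headers while the kernels themselves stay private to each translation unit.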
diff --git a/torchvision/csrc/cuda/ps_roi_pool_kernel.h b/torchvision/csrc/cuda/ps_roi_pool_kernel.h new file mode 100644 index 00000000000..21015d4693b --- /dev/null +++ b/torchvision/csrc/cuda/ps_roi_pool_kernel.h @@ -0,0 +1,29 @@ +#pragma once + +#include +#include "../macros.h" + +namespace vision { +namespace ops { + +VISION_API std::tuple ps_roi_pool_forward_cuda( + const at::Tensor& input, + const at::Tensor& rois, + double spatial_scale, + int64_t pooled_height, + int64_t pooled_width); + +VISION_API at::Tensor ps_roi_pool_backward_cuda( + const at::Tensor& grad, + const at::Tensor& rois, + const at::Tensor& channel_mapping, + double spatial_scale, + int64_t pooled_height, + int64_t pooled_width, + int64_t batch_size, + int64_t channels, + int64_t height, + int64_t width); + +} // namespace ops +} // namespace vision diff --git a/torchvision/csrc/cuda/ROIAlign_cuda.cu b/torchvision/csrc/cuda/roi_align_kernel.cu similarity index 85% rename from torchvision/csrc/cuda/ROIAlign_cuda.cu rename to torchvision/csrc/cuda/roi_align_kernel.cu index b773121d2b9..59388faa6ad 100644 --- a/torchvision/csrc/cuda/ROIAlign_cuda.cu +++ b/torchvision/csrc/cuda/roi_align_kernel.cu @@ -1,10 +1,14 @@ -#include -#include #include #include #include #include "cuda_helpers.h" +#include "roi_align_kernel.h" + +namespace vision { +namespace ops { + +namespace { template __device__ T bilinear_interpolate( @@ -61,7 +65,7 @@ __device__ T bilinear_interpolate( } template -__global__ void RoIAlignForward( +__global__ void roi_align_forward_kernel_impl( int nthreads, const T* input, const T spatial_scale, @@ -197,7 +201,7 @@ __device__ void bilinear_interpolate_gradient( } template -__global__ void RoIAlignBackward( +__global__ void roi_align_backward_kernel_impl( int nthreads, const T* grad_output, const T spatial_scale, @@ -308,9 +312,11 @@ __global__ void RoIAlignBackward( } // ix } // iy } // CUDA_1D_KERNEL_LOOP -} // RoIAlignBackward +} -at::Tensor ROIAlign_forward_cuda( +} // namespace + +at::Tensor roi_align_forward_cuda( const at::Tensor& input, const at::Tensor& rois, double spatial_scale, @@ -320,12 +326,11 @@ at::Tensor ROIAlign_forward_cuda( bool aligned) { TORCH_CHECK(input.is_cuda(), "input must be a CUDA tensor"); TORCH_CHECK(rois.is_cuda(), "rois must be a CUDA tensor"); - TORCH_CHECK( - rois.size(1) == 5, "rois must have shape as Tensor[K, 5]"); + TORCH_CHECK(rois.size(1) == 5, "rois must have shape as Tensor[K, 5]"); at::TensorArg input_t{input, "input", 1}, rois_t{rois, "rois", 2}; - at::CheckedFrom c = "ROIAlign_forward_cuda"; + at::CheckedFrom c = "roi_align_forward_cuda"; at::checkAllSameGPU(c, {input_t, rois_t}); at::checkAllSameType(c, {input_t, rois_t}); @@ -343,7 +348,7 @@ at::Tensor ROIAlign_forward_cuda( cudaStream_t stream = at::cuda::getCurrentCUDAStream(); dim3 grid(std::min( - ceil_div(static_cast(output_size), static_cast(512)), + ceil_div(static_cast(output_size), static_cast(512)), static_cast(4096))); dim3 block(512); @@ -352,28 +357,28 @@ at::Tensor ROIAlign_forward_cuda( return output; } - auto input_ = input.contiguous(), - rois_ = rois.contiguous(); - AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(), "ROIAlign_forward", [&] { - RoIAlignForward<<>>( - output_size, - input_.data_ptr(), - spatial_scale, - channels, - height, - width, - pooled_height, - pooled_width, - sampling_ratio, - aligned, - rois_.data_ptr(), - output.data_ptr()); - }); + auto input_ = input.contiguous(), rois_ = rois.contiguous(); + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + input.scalar_type(), 
"roi_align_forward_cuda", [&] { + roi_align_forward_kernel_impl<<>>( + output_size, + input_.data_ptr(), + spatial_scale, + channels, + height, + width, + pooled_height, + pooled_width, + sampling_ratio, + aligned, + rois_.data_ptr(), + output.data_ptr()); + }); AT_CUDA_CHECK(cudaGetLastError()); return output; } -at::Tensor ROIAlign_backward_cuda( +at::Tensor roi_align_backward_cuda( const at::Tensor& grad, const at::Tensor& rois, double spatial_scale, @@ -390,7 +395,7 @@ at::Tensor ROIAlign_backward_cuda( at::TensorArg grad_t{grad, "grad", 1}, rois_t{rois, "rois", 2}; - at::CheckedFrom c = "ROIAlign_backward_cuda"; + at::CheckedFrom c = "roi_align_backward_cuda"; at::checkAllSameGPU(c, {grad_t, rois_t}); at::checkAllSameType(c, {grad_t, rois_t}); @@ -402,7 +407,7 @@ at::Tensor ROIAlign_backward_cuda( cudaStream_t stream = at::cuda::getCurrentCUDAStream(); dim3 grid(std::min( - ceil_div(static_cast(grad.numel()), static_cast(512)), + ceil_div(static_cast(grad.numel()), static_cast(512)), static_cast(4096))); dim3 block(512); @@ -418,25 +423,29 @@ at::Tensor ROIAlign_backward_cuda( int w_stride = grad.stride(3); auto rois_ = rois.contiguous(); - AT_DISPATCH_FLOATING_TYPES_AND_HALF(grad.scalar_type(), "ROIAlign_backward", [&] { - RoIAlignBackward<<>>( - grad.numel(), - grad.data_ptr(), - spatial_scale, - channels, - height, - width, - pooled_height, - pooled_width, - sampling_ratio, - aligned, - grad_input.data_ptr(), - rois_.data_ptr(), - n_stride, - c_stride, - h_stride, - w_stride); - }); + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + grad.scalar_type(), "roi_align_backward_cuda", [&] { + roi_align_backward_kernel_impl<<>>( + grad.numel(), + grad.data_ptr(), + spatial_scale, + channels, + height, + width, + pooled_height, + pooled_width, + sampling_ratio, + aligned, + grad_input.data_ptr(), + rois_.data_ptr(), + n_stride, + c_stride, + h_stride, + w_stride); + }); AT_CUDA_CHECK(cudaGetLastError()); return grad_input; } + +} // namespace ops +} // namespace vision diff --git a/torchvision/csrc/cuda/roi_align_kernel.h b/torchvision/csrc/cuda/roi_align_kernel.h new file mode 100644 index 00000000000..71096201627 --- /dev/null +++ b/torchvision/csrc/cuda/roi_align_kernel.h @@ -0,0 +1,32 @@ +#pragma once + +#include +#include "../macros.h" + +namespace vision { +namespace ops { + +VISION_API at::Tensor roi_align_forward_cuda( + const at::Tensor& input, + const at::Tensor& rois, + double spatial_scale, + int64_t pooled_height, + int64_t pooled_width, + int64_t sampling_ratio, + bool aligned); + +VISION_API at::Tensor roi_align_backward_cuda( + const at::Tensor& grad, + const at::Tensor& rois, + double spatial_scale, + int64_t pooled_height, + int64_t pooled_width, + int64_t batch_size, + int64_t channels, + int64_t height, + int64_t width, + int64_t sampling_ratio, + bool aligned); + +} // namespace ops +} // namespace vision diff --git a/torchvision/csrc/cuda/ROIPool_cuda.cu b/torchvision/csrc/cuda/roi_pool_kernel.cu similarity index 77% rename from torchvision/csrc/cuda/ROIPool_cuda.cu rename to torchvision/csrc/cuda/roi_pool_kernel.cu index 3131b9eea7e..a96e79c87a9 100644 --- a/torchvision/csrc/cuda/ROIPool_cuda.cu +++ b/torchvision/csrc/cuda/roi_pool_kernel.cu @@ -1,13 +1,18 @@ -#include -#include #include #include +#include #include #include "cuda_helpers.h" +#include "roi_pool_kernel.h" + +namespace vision { +namespace ops { + +namespace { template -__global__ void RoIPoolForward( +__global__ void roi_pool_forward_kernel_impl( int nthreads, const T* input, const T spatial_scale, @@ -72,7 +77,7 
@@ __global__ void RoIPoolForward( } template -__global__ void RoIPoolBackward( +__global__ void roi_pool_backward_kernel_impl( int nthreads, const T* grad_output, const int* argmax_data, @@ -115,7 +120,9 @@ __global__ void RoIPoolBackward( } } -std::tuple ROIPool_forward_cuda( +} // namespace + +std::tuple roi_pool_forward_cuda( const at::Tensor& input, const at::Tensor& rois, double spatial_scale, @@ -128,7 +135,7 @@ std::tuple ROIPool_forward_cuda( at::TensorArg input_t{input, "input", 1}, rois_t{rois, "rois", 2}; - at::CheckedFrom c = "ROIPool_forward_cuda"; + at::CheckedFrom c = "roi_pool_forward_cuda"; at::checkAllSameGPU(c, {input_t, rois_t}); at::checkAllSameType(c, {input_t, rois_t}); @@ -149,7 +156,7 @@ std::tuple ROIPool_forward_cuda( cudaStream_t stream = at::cuda::getCurrentCUDAStream(); dim3 grid(std::min( - ceil_div(static_cast(output_size), static_cast(512)), + ceil_div(static_cast(output_size), static_cast(512)), static_cast(4096))); dim3 block(512); @@ -158,27 +165,27 @@ std::tuple ROIPool_forward_cuda( return std::make_tuple(output, argmax); } - auto input_ = input.contiguous(), - rois_ = rois.contiguous(); - AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(), "ROIPool_forward", [&] { - RoIPoolForward<<>>( - output_size, - input_.data_ptr(), - spatial_scale, - channels, - height, - width, - pooled_height, - pooled_width, - rois_.data_ptr(), - output.data_ptr(), - argmax.data_ptr()); - }); + auto input_ = input.contiguous(), rois_ = rois.contiguous(); + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + input.scalar_type(), "roi_pool_forward_cuda", [&] { + roi_pool_forward_kernel_impl<<>>( + output_size, + input_.data_ptr(), + spatial_scale, + channels, + height, + width, + pooled_height, + pooled_width, + rois_.data_ptr(), + output.data_ptr(), + argmax.data_ptr()); + }); AT_CUDA_CHECK(cudaGetLastError()); return std::make_tuple(output, argmax); } -at::Tensor ROIPool_backward_cuda( +at::Tensor roi_pool_backward_cuda( const at::Tensor& grad, const at::Tensor& rois, const at::Tensor& argmax, @@ -197,7 +204,7 @@ at::Tensor ROIPool_backward_cuda( at::TensorArg grad_t{grad, "grad", 1}, rois_t{rois, "rois", 2}, argmax_t{argmax, "argmax", 3}; - at::CheckedFrom c = "ROIPool_backward_cuda"; + at::CheckedFrom c = "roi_pool_backward_cuda"; at::checkAllSameGPU(c, {grad_t, rois_t, argmax_t}); at::checkAllSameType(c, {grad_t, rois_t}); @@ -211,7 +218,7 @@ at::Tensor ROIPool_backward_cuda( cudaStream_t stream = at::cuda::getCurrentCUDAStream(); dim3 grid(std::min( - ceil_div(static_cast(grad.numel()), static_cast(512)), + ceil_div(static_cast(grad.numel()), static_cast(512)), static_cast(4096))); dim3 block(512); @@ -226,27 +233,30 @@ at::Tensor ROIPool_backward_cuda( int h_stride = grad.stride(2); int w_stride = grad.stride(3); - auto argmax_ = argmax.contiguous(), - rois_ = rois.contiguous(); - AT_DISPATCH_FLOATING_TYPES_AND_HALF(grad.scalar_type(), "ROIPool_backward", [&] { - RoIPoolBackward<<>>( - grad.numel(), - grad.data_ptr(), - argmax_.data_ptr(), - num_rois, - spatial_scale, - channels, - height, - width, - pooled_height, - pooled_width, - grad_input.data_ptr(), - rois_.data_ptr(), - n_stride, - c_stride, - h_stride, - w_stride); - }); + auto argmax_ = argmax.contiguous(), rois_ = rois.contiguous(); + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + grad.scalar_type(), "roi_pool_backward_cuda", [&] { + roi_pool_backward_kernel_impl<<>>( + grad.numel(), + grad.data_ptr(), + argmax_.data_ptr(), + num_rois, + spatial_scale, + channels, + height, + width, + pooled_height, + pooled_width, + 
grad_input.data_ptr(), + rois_.data_ptr(), + n_stride, + c_stride, + h_stride, + w_stride); + }); AT_CUDA_CHECK(cudaGetLastError()); return grad_input; } + +} // namespace ops +} // namespace vision diff --git a/torchvision/csrc/cuda/roi_pool_kernel.h b/torchvision/csrc/cuda/roi_pool_kernel.h new file mode 100644 index 00000000000..71a649968db --- /dev/null +++ b/torchvision/csrc/cuda/roi_pool_kernel.h @@ -0,0 +1,29 @@ +#pragma once + +#include +#include "../macros.h" + +namespace vision { +namespace ops { + +VISION_API std::tuple roi_pool_forward_cuda( + const at::Tensor& input, + const at::Tensor& rois, + double spatial_scale, + int64_t pooled_height, + int64_t pooled_width); + +VISION_API at::Tensor roi_pool_backward_cuda( + const at::Tensor& grad, + const at::Tensor& rois, + const at::Tensor& argmax, + double spatial_scale, + int64_t pooled_height, + int64_t pooled_width, + int64_t batch_size, + int64_t channels, + int64_t height, + int64_t width); + +} // namespace ops +} // namespace vision diff --git a/torchvision/csrc/cuda/vision_cuda.h b/torchvision/csrc/cuda/vision_cuda.h deleted file mode 100644 index bf57f1c7967..00000000000 --- a/torchvision/csrc/cuda/vision_cuda.h +++ /dev/null @@ -1,124 +0,0 @@ -#pragma once -#include -#include "../macros.h" - -VISION_API at::Tensor DeformConv2d_forward_cuda( - const at::Tensor& input, - const at::Tensor& weight, - const at::Tensor& offset, - const at::Tensor& mask, - const at::Tensor& bias, - int64_t stride_h, - int64_t stride_w, - int64_t pad_h, - int64_t pad_w, - int64_t dilation_h, - int64_t dilation_w, - int64_t groups, - int64_t deformable_groups, - bool use_mask); - -VISION_API std:: - tuple - DeformConv2d_backward_cuda( - const at::Tensor& grad_out, - const at::Tensor& input, - const at::Tensor& weight, - const at::Tensor& offset, - const at::Tensor& mask, - const at::Tensor& bias, - int64_t stride_h, - int64_t stride_w, - int64_t pad_h, - int64_t pad_w, - int64_t dilation_h, - int64_t dilation_w, - int64_t groups, - int64_t deformable_groups, - bool use_mask); - -VISION_API at::Tensor nms_cuda( - const at::Tensor& dets, - const at::Tensor& scores, - double iou_threshold); - -VISION_API std::tuple PSROIAlign_forward_cuda( - const at::Tensor& input, - const at::Tensor& rois, - double spatial_scale, - int64_t pooled_height, - int64_t pooled_width, - int64_t sampling_ratio); - -VISION_API at::Tensor PSROIAlign_backward_cuda( - const at::Tensor& grad, - const at::Tensor& rois, - const at::Tensor& channel_mapping, - double spatial_scale, - int64_t pooled_height, - int64_t pooled_width, - int64_t sampling_ratio, - int64_t batch_size, - int64_t channels, - int64_t height, - int64_t width); - -VISION_API std::tuple PSROIPool_forward_cuda( - const at::Tensor& input, - const at::Tensor& rois, - double spatial_scale, - int64_t pooled_height, - int64_t pooled_width); - -VISION_API at::Tensor PSROIPool_backward_cuda( - const at::Tensor& grad, - const at::Tensor& rois, - const at::Tensor& channel_mapping, - double spatial_scale, - int64_t pooled_height, - int64_t pooled_width, - int64_t batch_size, - int64_t channels, - int64_t height, - int64_t width); - -VISION_API at::Tensor ROIAlign_forward_cuda( - const at::Tensor& input, - const at::Tensor& rois, - double spatial_scale, - int64_t pooled_height, - int64_t pooled_width, - int64_t sampling_ratio, - bool aligned); - -VISION_API at::Tensor ROIAlign_backward_cuda( - const at::Tensor& grad, - const at::Tensor& rois, - double spatial_scale, - int64_t pooled_height, - int64_t pooled_width, - int64_t 
batch_size, - int64_t channels, - int64_t height, - int64_t width, - int64_t sampling_ratio, - bool aligned); - -VISION_API std::tuple<at::Tensor, at::Tensor> ROIPool_forward_cuda( - const at::Tensor& input, - const at::Tensor& rois, - const double spatial_scale, - const int64_t pooled_height, - const int64_t pooled_width); - -VISION_API at::Tensor ROIPool_backward_cuda( - const at::Tensor& grad, - const at::Tensor& rois, - const at::Tensor& argmax, - const double spatial_scale, - const int64_t pooled_height, - const int64_t pooled_width, - const int64_t batch_size, - const int64_t channels, - const int64_t height, - const int64_t width); diff --git a/torchvision/csrc/DeformConv.h b/torchvision/csrc/deform_conv2d.cpp similarity index 96% rename from torchvision/csrc/DeformConv.h rename to torchvision/csrc/deform_conv2d.cpp index f8a8dba60e6..e8a416683f2 100644 --- a/torchvision/csrc/DeformConv.h +++ b/torchvision/csrc/deform_conv2d.cpp @@ -1,17 +1,12 @@ -#pragma once +#include "deform_conv2d.h" +#include <torch/types.h> -#include "cpu/vision_cpu.h" - -#ifdef WITH_CUDA -#include "autocast.h" -#include "cuda/vision_cuda.h" -#endif -#ifdef WITH_HIP -#include "autocast.h" -#include "hip/vision_cuda.h" +#if defined(WITH_CUDA) || defined(WITH_HIP) +#include <ATen/autocast_mode.h> #endif -// TODO: put this stuff in torchvision namespace +namespace vision { +namespace ops { at::Tensor deform_conv2d( const at::Tensor& input, @@ -49,7 +44,7 @@ at::Tensor deform_conv2d( } #if defined(WITH_CUDA) || defined(WITH_HIP) -at::Tensor DeformConv2d_autocast( +at::Tensor deform_conv2d_autocast( const at::Tensor& input, const at::Tensor& weight, const at::Tensor& offset, @@ -123,6 +118,8 @@ _deform_conv2d_backward( use_mask); } +namespace { + class DeformConv2dFunction : public torch::autograd::Function<DeformConv2dFunction> { public: @@ -297,7 +294,9 @@ class DeformConv2dBackwardFunction } }; -at::Tensor DeformConv2d_autograd( +} // namespace + +at::Tensor deform_conv2d_autograd( const at::Tensor& input, const at::Tensor& weight, const at::Tensor& offset, @@ -330,7 +329,7 @@ } std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor> -DeformConv2d_backward_autograd( +deform_conv2d_backward_autograd( const at::Tensor& grad, const at::Tensor& input, const at::Tensor& weight, @@ -365,3 +364,6 @@ return std::make_tuple(result[0], result[1], result[2], result[3], result[4]); } + +} // namespace ops +} // namespace vision diff --git a/torchvision/csrc/deform_conv2d.h b/torchvision/csrc/deform_conv2d.h new file mode 100644 index 00000000000..85675ee6bf2 --- /dev/null +++ b/torchvision/csrc/deform_conv2d.h @@ -0,0 +1,106 @@ +#pragma once + +#include "cpu/deform_conv2d_kernel.h" + +#ifdef WITH_CUDA +#include "cuda/deform_conv2d_kernel.h" +#endif +#ifdef WITH_HIP +#include "hip/deform_conv2d_kernel.h" +#endif + +namespace vision { +namespace ops { + +// C++ Forward +at::Tensor deform_conv2d( + const at::Tensor& input, + const at::Tensor& weight, + const at::Tensor& offset, + const at::Tensor& mask, + const at::Tensor& bias, + int64_t stride_h, + int64_t stride_w, + int64_t pad_h, + int64_t pad_w, + int64_t dilation_h, + int64_t dilation_w, + int64_t groups, + int64_t offset_groups, + bool use_mask); + +// Autocast Forward +#if defined(WITH_CUDA) || defined(WITH_HIP) +at::Tensor deform_conv2d_autocast( + const at::Tensor& input, + const at::Tensor& weight, + const at::Tensor& offset, + const at::Tensor& mask, + const at::Tensor& bias, + int64_t stride_h, + int64_t stride_w, + int64_t pad_h, + int64_t pad_w, + int64_t dilation_h, + int64_t dilation_w, + int64_t groups, + int64_t
offset_groups, + bool use_mask); +#endif + +// C++ Backward +std::tuple +_deform_conv2d_backward( + const at::Tensor& grad, + const at::Tensor& input, + const at::Tensor& weight, + const at::Tensor& offset, + const at::Tensor& mask, + const at::Tensor& bias, + int64_t stride_h, + int64_t stride_w, + int64_t pad_h, + int64_t pad_w, + int64_t dilation_h, + int64_t dilation_w, + int64_t groups, + int64_t offset_groups, + bool use_mask); + +// Autograd Forward and Backward +at::Tensor deform_conv2d_autograd( + const at::Tensor& input, + const at::Tensor& weight, + const at::Tensor& offset, + const at::Tensor& mask, + const at::Tensor& bias, + int64_t stride_h, + int64_t stride_w, + int64_t pad_h, + int64_t pad_w, + int64_t dilation_h, + int64_t dilation_w, + int64_t groups, + int64_t offset_groups, + bool use_mask); + +std::tuple +deform_conv2d_backward_autograd( + const at::Tensor& grad, + const at::Tensor& input, + const at::Tensor& weight, + const at::Tensor& offset, + const at::Tensor& mask, + const at::Tensor& bias, + int64_t stride_h, + int64_t stride_w, + int64_t pad_h, + int64_t pad_w, + int64_t dilation_h, + int64_t dilation_w, + int64_t groups, + int64_t offset_groups, + bool use_mask); + +} // namespace ops +} // namespace vision diff --git a/torchvision/csrc/macros.h b/torchvision/csrc/macros.h index cb01005a022..559140a933a 100644 --- a/torchvision/csrc/macros.h +++ b/torchvision/csrc/macros.h @@ -1,5 +1,4 @@ -#ifndef TORCHVISION_MACROS_H -#define TORCHVISION_MACROS_H +#pragma once #ifdef _WIN32 #if defined(torchvision_EXPORTS) @@ -20,5 +19,3 @@ #define VISION_INLINE_VARIABLE __attribute__((weak)) #endif #endif - -#endif // TORCHVISION_MACROS_H diff --git a/torchvision/csrc/models/alexnet.h b/torchvision/csrc/models/alexnet.h index 673598d3a53..d2529c88882 100644 --- a/torchvision/csrc/models/alexnet.h +++ b/torchvision/csrc/models/alexnet.h @@ -1,8 +1,7 @@ -#ifndef ALEXNET_H -#define ALEXNET_H +#pragma once #include -#include "general.h" +#include "../macros.h" namespace vision { namespace models { @@ -20,5 +19,3 @@ TORCH_MODULE(AlexNet); } // namespace models } // namespace vision - -#endif // ALEXNET_H diff --git a/torchvision/csrc/models/densenet.h b/torchvision/csrc/models/densenet.h index 731d0c7879f..9358631e4ba 100644 --- a/torchvision/csrc/models/densenet.h +++ b/torchvision/csrc/models/densenet.h @@ -1,8 +1,7 @@ -#ifndef DENSENET_H -#define DENSENET_H +#pragma once #include -#include "general.h" +#include "../macros.h" namespace vision { namespace models { @@ -82,5 +81,3 @@ TORCH_MODULE(DenseNet161); } // namespace models } // namespace vision - -#endif // DENSENET_H diff --git a/torchvision/csrc/models/general.h b/torchvision/csrc/models/general.h deleted file mode 100644 index 1378a1c85d2..00000000000 --- a/torchvision/csrc/models/general.h +++ /dev/null @@ -1,14 +0,0 @@ -#ifndef VISION_GENERAL_H -#define VISION_GENERAL_H - -#ifdef _WIN32 -#if defined(torchvision_EXPORTS) -#define VISION_API __declspec(dllexport) -#else -#define VISION_API __declspec(dllimport) -#endif -#else -#define VISION_API -#endif - -#endif // VISION_GENERAL_H \ No newline at end of file diff --git a/torchvision/csrc/models/googlenet.h b/torchvision/csrc/models/googlenet.h index 34b0cf5077f..3d4b3faf7e4 100644 --- a/torchvision/csrc/models/googlenet.h +++ b/torchvision/csrc/models/googlenet.h @@ -1,8 +1,7 @@ -#ifndef GOOGLENET_H -#define GOOGLENET_H +#pragma once #include -#include "general.h" +#include "../macros.h" namespace vision { namespace models { @@ -86,5 +85,3 @@ TORCH_MODULE(GoogLeNet); 
} // namespace models } // namespace vision - -#endif // GOOGLENET_H diff --git a/torchvision/csrc/models/inception.h b/torchvision/csrc/models/inception.h index 58f1a7c1054..08f329fbc1b 100644 --- a/torchvision/csrc/models/inception.h +++ b/torchvision/csrc/models/inception.h @@ -1,8 +1,7 @@ -#ifndef INCEPTION_H -#define INCEPTION_H +#pragma once #include -#include "general.h" +#include "../macros.h" namespace vision { namespace models { @@ -124,5 +123,3 @@ TORCH_MODULE(InceptionV3); } // namespace models } // namespace vision - -#endif // INCEPTION_H diff --git a/torchvision/csrc/models/mnasnet.h b/torchvision/csrc/models/mnasnet.h index 6f45101f332..e69559b2dd7 100644 --- a/torchvision/csrc/models/mnasnet.h +++ b/torchvision/csrc/models/mnasnet.h @@ -1,8 +1,7 @@ -#ifndef MNASNET_H -#define MNASNET_H +#pragma once #include -#include "general.h" +#include "../macros.h" namespace vision { namespace models { @@ -43,5 +42,3 @@ TORCH_MODULE(MNASNet1_3); } // namespace models } // namespace vision - -#endif // MNASNET_H diff --git a/torchvision/csrc/models/mobilenet.h b/torchvision/csrc/models/mobilenet.h index 6af0a597e1f..63d26474b23 100644 --- a/torchvision/csrc/models/mobilenet.h +++ b/torchvision/csrc/models/mobilenet.h @@ -1,8 +1,7 @@ -#ifndef MOBILENET_H -#define MOBILENET_H +#pragma once #include -#include "general.h" +#include "../macros.h" namespace vision { namespace models { @@ -22,5 +21,3 @@ struct VISION_API MobileNetV2Impl : torch::nn::Module { TORCH_MODULE(MobileNetV2); } // namespace models } // namespace vision - -#endif // MOBILENET_H diff --git a/torchvision/csrc/models/models.h b/torchvision/csrc/models/models.h index 1d47f2e3dd6..8376ed12020 100644 --- a/torchvision/csrc/models/models.h +++ b/torchvision/csrc/models/models.h @@ -1,5 +1,4 @@ -#ifndef MODELS_H -#define MODELS_H +#pragma once #include "alexnet.h" #include "densenet.h" @@ -11,5 +10,3 @@ #include "shufflenetv2.h" #include "squeezenet.h" #include "vgg.h" - -#endif // MODELS_H diff --git a/torchvision/csrc/models/modelsimpl.h b/torchvision/csrc/models/modelsimpl.h index 1dc8d06b15e..8f7663b32ad 100644 --- a/torchvision/csrc/models/modelsimpl.h +++ b/torchvision/csrc/models/modelsimpl.h @@ -1,5 +1,4 @@ -#ifndef MODELSIMPL_H -#define MODELSIMPL_H +#pragma once #include @@ -42,5 +41,3 @@ inline bool double_compare(double a, double b) { } // namespace modelsimpl } // namespace models } // namespace vision - -#endif // MODELSIMPL_H diff --git a/torchvision/csrc/models/resnet.h b/torchvision/csrc/models/resnet.h index e17dfe49a1a..b8caf4332c6 100644 --- a/torchvision/csrc/models/resnet.h +++ b/torchvision/csrc/models/resnet.h @@ -1,8 +1,7 @@ -#ifndef RESNET_H -#define RESNET_H +#pragma once #include -#include "general.h" +#include "../macros.h" namespace vision { namespace models { @@ -256,5 +255,3 @@ TORCH_MODULE(WideResNet101_2); } // namespace models } // namespace vision - -#endif // RESNET_H diff --git a/torchvision/csrc/models/shufflenetv2.h b/torchvision/csrc/models/shufflenetv2.h index 96d53d554ab..ea8524c72fa 100644 --- a/torchvision/csrc/models/shufflenetv2.h +++ b/torchvision/csrc/models/shufflenetv2.h @@ -1,8 +1,7 @@ -#ifndef SHUFFLENETV2_H -#define SHUFFLENETV2_H +#pragma once #include -#include "general.h" +#include "../macros.h" namespace vision { namespace models { @@ -44,5 +43,3 @@ TORCH_MODULE(ShuffleNetV2_x2_0); } // namespace models } // namespace vision - -#endif // SHUFFLENETV2_H diff --git a/torchvision/csrc/models/squeezenet.h b/torchvision/csrc/models/squeezenet.h index 
6f55387de56..c43d2b07f13 100644 --- a/torchvision/csrc/models/squeezenet.h +++ b/torchvision/csrc/models/squeezenet.h @@ -1,8 +1,7 @@ -#ifndef SQUEEZENET_H -#define SQUEEZENET_H +#pragma once #include -#include "general.h" +#include "../macros.h" namespace vision { namespace models { @@ -36,5 +35,3 @@ TORCH_MODULE(SqueezeNet1_1); } // namespace models } // namespace vision - -#endif // SQUEEZENET_H diff --git a/torchvision/csrc/models/vgg.h b/torchvision/csrc/models/vgg.h index b5c600a68ab..4a540bea822 100644 --- a/torchvision/csrc/models/vgg.h +++ b/torchvision/csrc/models/vgg.h @@ -1,8 +1,7 @@ -#ifndef VGG_H -#define VGG_H +#pragma once #include -#include "general.h" +#include "../macros.h" namespace vision { namespace models { @@ -89,5 +88,3 @@ TORCH_MODULE(VGG19BN); } // namespace models } // namespace vision - -#endif // VGG_H diff --git a/torchvision/csrc/empty_tensor_op.h b/torchvision/csrc/new_empty_tensor_op.cpp similarity index 62% rename from torchvision/csrc/empty_tensor_op.h rename to torchvision/csrc/new_empty_tensor_op.cpp index 99448109762..30941d52ef7 100644 --- a/torchvision/csrc/empty_tensor_op.h +++ b/torchvision/csrc/new_empty_tensor_op.cpp @@ -1,14 +1,17 @@ -#pragma once +#include "new_empty_tensor_op.h" +#include -// All pure C++ headers for the C++ frontend. -#include +namespace vision { +namespace ops { + +namespace { class NewEmptyTensorOp : public torch::autograd::Function { public: static torch::autograd::variable_list forward( torch::autograd::AutogradContext* ctx, - torch::autograd::Variable input, - c10::List new_shape) { + const torch::autograd::Variable& input, + const c10::List& new_shape) { ctx->saved_data["shape"] = input.sizes(); std::vector shape(new_shape.begin(), new_shape.end()); return {input.new_empty(shape, at::TensorOptions())}; @@ -16,7 +19,7 @@ class NewEmptyTensorOp : public torch::autograd::Function { static torch::autograd::variable_list backward( torch::autograd::AutogradContext* ctx, - torch::autograd::variable_list grad_output) { + const torch::autograd::variable_list& grad_output) { // Use data saved in forward auto shape = ctx->saved_data["shape"].toIntList(); auto out = forward(ctx, grad_output[0], shape); @@ -24,6 +27,13 @@ class NewEmptyTensorOp : public torch::autograd::Function { } }; -at::Tensor new_empty_tensor(const at::Tensor& input, c10::List shape) { +} // namespace + +at::Tensor new_empty_tensor( + const at::Tensor& input, + const c10::List& shape) { return NewEmptyTensorOp::apply(input, shape)[0]; } + +} // namespace ops +} // namespace vision diff --git a/torchvision/csrc/new_empty_tensor_op.h b/torchvision/csrc/new_empty_tensor_op.h new file mode 100644 index 00000000000..f00cb67b779 --- /dev/null +++ b/torchvision/csrc/new_empty_tensor_op.h @@ -0,0 +1,13 @@ +#pragma once + +#include + +namespace vision { +namespace ops { + +at::Tensor new_empty_tensor( + const at::Tensor& input, + const c10::List& shape); + +} // namespace ops +} // namespace vision diff --git a/torchvision/csrc/nms.cpp b/torchvision/csrc/nms.cpp new file mode 100644 index 00000000000..2f9dbee9a32 --- /dev/null +++ b/torchvision/csrc/nms.cpp @@ -0,0 +1,35 @@ +#include "nms.h" +#include + +#if defined(WITH_CUDA) || defined(WITH_HIP) +#include +#endif + +namespace vision { +namespace ops { + +at::Tensor nms( + const at::Tensor& dets, + const at::Tensor& scores, + double iou_threshold) { + static auto op = c10::Dispatcher::singleton() + .findSchemaOrThrow("torchvision::nms", "") + .typed(); + return op.call(dets, scores, iou_threshold); +} + +#if 
defined(WITH_CUDA) || defined(WITH_HIP) +at::Tensor nms_autocast( + const at::Tensor& dets, + const at::Tensor& scores, + double iou_threshold) { + c10::impl::ExcludeDispatchKeyGuard no_autocast(c10::DispatchKey::Autocast); + return nms( + at::autocast::cached_cast(at::kFloat, dets), + at::autocast::cached_cast(at::kFloat, scores), + iou_threshold); +} +#endif + +} // namespace ops +} // namespace vision diff --git a/torchvision/csrc/nms.h b/torchvision/csrc/nms.h index aed675e5d26..ac7cbc53caf 100644 --- a/torchvision/csrc/nms.h +++ b/torchvision/csrc/nms.h @@ -1,36 +1,30 @@ #pragma once -#include "cpu/vision_cpu.h" +#include "cpu/nms_kernel.h" #ifdef WITH_CUDA -#include "autocast.h" -#include "cuda/vision_cuda.h" +#include "cuda/nms_kernel.h" #endif #ifdef WITH_HIP -#include "autocast.h" -#include "hip/vision_cuda.h" +#include "hip/nms_kernel.h" #endif -// nms dispatch nexus +namespace vision { +namespace ops { + +// C++ Forward at::Tensor nms( const at::Tensor& dets, const at::Tensor& scores, - double iou_threshold) { - static auto op = c10::Dispatcher::singleton() - .findSchemaOrThrow("torchvision::nms", "") - .typed(); - return op.call(dets, scores, iou_threshold); -} + double iou_threshold); +// Autocast Forward #if defined(WITH_CUDA) || defined(WITH_HIP) at::Tensor nms_autocast( const at::Tensor& dets, const at::Tensor& scores, - double iou_threshold) { - c10::impl::ExcludeDispatchKeyGuard no_autocast(c10::DispatchKey::Autocast); - return nms( - at::autocast::cached_cast(at::kFloat, dets), - at::autocast::cached_cast(at::kFloat, scores), - iou_threshold); -} + double iou_threshold); #endif + +} // namespace ops +} // namespace vision diff --git a/torchvision/csrc/PSROIAlign.h b/torchvision/csrc/ps_roi_align.cpp similarity index 93% rename from torchvision/csrc/PSROIAlign.h rename to torchvision/csrc/ps_roi_align.cpp index 1e5dd17aabc..5add21aaeec 100644 --- a/torchvision/csrc/PSROIAlign.h +++ b/torchvision/csrc/ps_roi_align.cpp @@ -1,19 +1,12 @@ -#pragma once +#include "ps_roi_align.h" +#include -#include "cpu/vision_cpu.h" - -#ifdef WITH_CUDA -#include "autocast.h" -#include "cuda/vision_cuda.h" -#endif -#ifdef WITH_HIP -#include "autocast.h" -#include "hip/vision_cuda.h" +#if defined(WITH_CUDA) || defined(WITH_HIP) +#include #endif -#include - -// TODO: put this stuff in torchvision namespace +namespace vision { +namespace ops { std::tuple ps_roi_align( const at::Tensor& input, @@ -30,7 +23,7 @@ std::tuple ps_roi_align( } #if defined(WITH_CUDA) || defined(WITH_HIP) -std::tuple PSROIAlign_autocast( +std::tuple ps_roi_align_autocast( const at::Tensor& input, const at::Tensor& rois, double spatial_scale, @@ -82,6 +75,8 @@ at::Tensor _ps_roi_align_backward( width); } +namespace { + class PSROIAlignFunction : public torch::autograd::Function { public: @@ -186,7 +181,9 @@ class PSROIAlignBackwardFunction } }; -std::tuple PSROIAlign_autograd( +} // namespace + +std::tuple ps_roi_align_autograd( const at::Tensor& input, const at::Tensor& rois, double spatial_scale, @@ -199,7 +196,7 @@ std::tuple PSROIAlign_autograd( return std::make_tuple(result[0], result[1]); } -at::Tensor PSROIAlign_backward_autograd( +at::Tensor ps_roi_align_backward_autograd( const at::Tensor& grad, const at::Tensor& rois, const at::Tensor& channel_mapping, @@ -224,3 +221,6 @@ at::Tensor PSROIAlign_backward_autograd( height, width)[0]; } + +} // namespace ops +} // namespace vision diff --git a/torchvision/csrc/ps_roi_align.h b/torchvision/csrc/ps_roi_align.h new file mode 100644 index 00000000000..c21107df4f4 --- 
/dev/null +++ b/torchvision/csrc/ps_roi_align.h @@ -0,0 +1,72 @@ +#pragma once + +#include "cpu/ps_roi_align_kernel.h" + +#ifdef WITH_CUDA +#include "cuda/ps_roi_align_kernel.h" +#endif +#ifdef WITH_HIP +#include "hip/ps_roi_align_kernel.h" +#endif + +namespace vision { +namespace ops { + +// C++ Forward +std::tuple ps_roi_align( + const at::Tensor& input, + const at::Tensor& rois, + double spatial_scale, + int64_t pooled_height, + int64_t pooled_width, + int64_t sampling_ratio); + +// Autocast Forward +#if defined(WITH_CUDA) || defined(WITH_HIP) +std::tuple ps_roi_align_autocast( + const at::Tensor& input, + const at::Tensor& rois, + double spatial_scale, + int64_t pooled_height, + int64_t pooled_width, + int64_t sampling_ratio); +#endif + +// C++ Backward +at::Tensor _ps_roi_align_backward( + const at::Tensor& grad, + const at::Tensor& rois, + const at::Tensor& channel_mapping, + double spatial_scale, + int64_t pooled_height, + int64_t pooled_width, + int64_t sampling_ratio, + int64_t batch_size, + int64_t channels, + int64_t height, + int64_t width); + +// Autograd Forward and Backward +std::tuple ps_roi_align_autograd( + const at::Tensor& input, + const at::Tensor& rois, + double spatial_scale, + int64_t pooled_height, + int64_t pooled_width, + int64_t sampling_ratio); + +at::Tensor ps_roi_align_backward_autograd( + const at::Tensor& grad, + const at::Tensor& rois, + const at::Tensor& channel_mapping, + double spatial_scale, + int64_t pooled_height, + int64_t pooled_width, + int64_t sampling_ratio, + int64_t batch_size, + int64_t channels, + int64_t height, + int64_t width); + +} // namespace ops +} // namespace vision diff --git a/torchvision/csrc/PSROIPool.h b/torchvision/csrc/ps_roi_pool.cpp similarity index 93% rename from torchvision/csrc/PSROIPool.h rename to torchvision/csrc/ps_roi_pool.cpp index c3ced9e7842..88a733a6369 100644 --- a/torchvision/csrc/PSROIPool.h +++ b/torchvision/csrc/ps_roi_pool.cpp @@ -1,17 +1,12 @@ -#pragma once +#include "ps_roi_pool.h" +#include -#include "cpu/vision_cpu.h" - -#ifdef WITH_CUDA -#include "autocast.h" -#include "cuda/vision_cuda.h" -#endif -#ifdef WITH_HIP -#include "autocast.h" -#include "hip/vision_cuda.h" +#if defined(WITH_CUDA) || defined(WITH_HIP) +#include #endif -// TODO: put this stuff in torchvision namespace +namespace vision { +namespace ops { std::tuple ps_roi_pool( const at::Tensor& input, @@ -26,7 +21,7 @@ std::tuple ps_roi_pool( } #if defined(WITH_CUDA) || defined(WITH_HIP) -std::tuple PSROIPool_autocast( +std::tuple ps_roi_pool_autocast( const at::Tensor& input, const at::Tensor& rois, double spatial_scale, @@ -74,6 +69,8 @@ at::Tensor _ps_roi_pool_backward( width); } +namespace { + class PSROIPoolFunction : public torch::autograd::Function { public: static torch::autograd::variable_list forward( @@ -166,7 +163,9 @@ class PSROIPoolBackwardFunction } }; -std::tuple PSROIPool_autograd( +} // namespace + +std::tuple ps_roi_pool_autograd( const at::Tensor& input, const at::Tensor& rois, double spatial_scale, @@ -178,7 +177,7 @@ std::tuple PSROIPool_autograd( return std::make_tuple(result[0], result[1]); } -at::Tensor PSROIPool_backward_autograd( +at::Tensor ps_roi_pool_backward_autograd( const at::Tensor& grad, const at::Tensor& rois, const at::Tensor& channel_mapping, @@ -201,3 +200,6 @@ at::Tensor PSROIPool_backward_autograd( height, width)[0]; } + +} // namespace ops +} // namespace vision diff --git a/torchvision/csrc/ps_roi_pool.h b/torchvision/csrc/ps_roi_pool.h new file mode 100644 index 00000000000..20ae17d3ad1 --- 
/dev/null +++ b/torchvision/csrc/ps_roi_pool.h @@ -0,0 +1,67 @@ +#pragma once + +#include "cpu/ps_roi_pool_kernel.h" + +#ifdef WITH_CUDA +#include "cuda/ps_roi_pool_kernel.h" +#endif +#ifdef WITH_HIP +#include "hip/ps_roi_pool_kernel.h" +#endif + +namespace vision { +namespace ops { + +// C++ Forward +std::tuple ps_roi_pool( + const at::Tensor& input, + const at::Tensor& rois, + double spatial_scale, + int64_t pooled_height, + int64_t pooled_width); + +// Autocast Forward +#if defined(WITH_CUDA) || defined(WITH_HIP) +std::tuple ps_roi_pool_autocast( + const at::Tensor& input, + const at::Tensor& rois, + double spatial_scale, + int64_t pooled_height, + int64_t pooled_width); +#endif + +// C++ Backward +at::Tensor _ps_roi_pool_backward( + const at::Tensor& grad, + const at::Tensor& rois, + const at::Tensor& channel_mapping, + double spatial_scale, + int64_t pooled_height, + int64_t pooled_width, + int64_t batch_size, + int64_t channels, + int64_t height, + int64_t width); + +// Autograd Forward and Backward +std::tuple ps_roi_pool_autograd( + const at::Tensor& input, + const at::Tensor& rois, + double spatial_scale, + int64_t pooled_height, + int64_t pooled_width); + +at::Tensor ps_roi_pool_backward_autograd( + const at::Tensor& grad, + const at::Tensor& rois, + const at::Tensor& channel_mapping, + double spatial_scale, + int64_t pooled_height, + int64_t pooled_width, + int64_t batch_size, + int64_t channels, + int64_t height, + int64_t width); + +} // namespace ops +} // namespace vision diff --git a/torchvision/csrc/ROIAlign.h b/torchvision/csrc/roi_align.cpp similarity index 94% rename from torchvision/csrc/ROIAlign.h rename to torchvision/csrc/roi_align.cpp index 708981f061e..63643a6cb46 100644 --- a/torchvision/csrc/ROIAlign.h +++ b/torchvision/csrc/roi_align.cpp @@ -1,19 +1,13 @@ -#pragma once +#include "roi_align.h" +#include -#include "cpu/vision_cpu.h" - -#ifdef WITH_CUDA -#include "autocast.h" -#include "cuda/vision_cuda.h" -#endif -#ifdef WITH_HIP -#include "autocast.h" -#include "hip/vision_cuda.h" +#if defined(WITH_CUDA) || defined(WITH_HIP) +#include #endif -// TODO: put this stuff in torchvision namespace +namespace vision { +namespace ops { -// roi_align dispatch nexus at::Tensor roi_align( const at::Tensor& input, // Input feature map. const at::Tensor& rois, // List of ROIs to pool over. 
@@ -39,7 +33,7 @@ at::Tensor roi_align( } #if defined(WITH_CUDA) || defined(WITH_HIP) -at::Tensor ROIAlign_autocast( +at::Tensor roi_align_autocast( const at::Tensor& input, const at::Tensor& rois, double spatial_scale, @@ -90,6 +84,8 @@ at::Tensor _roi_align_backward( aligned); } +namespace { + class ROIAlignFunction : public torch::autograd::Function { public: static torch::autograd::variable_list forward( @@ -189,7 +185,9 @@ class ROIAlignBackwardFunction } }; -at::Tensor ROIAlign_autograd( +} // namespace + +at::Tensor roi_align_autograd( const at::Tensor& input, const at::Tensor& rois, double spatial_scale, @@ -207,7 +205,7 @@ at::Tensor ROIAlign_autograd( aligned)[0]; } -at::Tensor ROIAlign_backward_autograd( +at::Tensor roi_align_backward_autograd( const at::Tensor& grad, const at::Tensor& rois, double spatial_scale, @@ -232,3 +230,6 @@ at::Tensor ROIAlign_backward_autograd( sampling_ratio, aligned)[0]; } + +} // namespace ops +} // namespace vision diff --git a/torchvision/csrc/roi_align.h b/torchvision/csrc/roi_align.h new file mode 100644 index 00000000000..1e92c8d2134 --- /dev/null +++ b/torchvision/csrc/roi_align.h @@ -0,0 +1,75 @@ +#pragma once + +#include "cpu/roi_align_kernel.h" + +#ifdef WITH_CUDA +#include "cuda/roi_align_kernel.h" +#endif +#ifdef WITH_HIP +#include "hip/roi_align_kernel.h" +#endif + +namespace vision { +namespace ops { + +// C++ Forward +at::Tensor roi_align( + const at::Tensor& input, + const at::Tensor& rois, + double spatial_scale, + int64_t pooled_height, + int64_t pooled_width, + int64_t sampling_ratio, + bool aligned); + +// Autocast Forward +#if defined(WITH_CUDA) || defined(WITH_HIP) +at::Tensor roi_align_autocast( + const at::Tensor& input, + const at::Tensor& rois, + double spatial_scale, + int64_t pooled_height, + int64_t pooled_width, + int64_t sampling_ratio, + bool aligned); +#endif + +// C++ Backward +at::Tensor _roi_align_backward( + const at::Tensor& grad, + const at::Tensor& rois, + double spatial_scale, + int64_t pooled_height, + int64_t pooled_width, + int64_t batch_size, + int64_t channels, + int64_t height, + int64_t width, + int64_t sampling_ratio, + bool aligned); + +// Autograd Forward and Backward +at::Tensor roi_align_autograd( + const at::Tensor& input, + const at::Tensor& rois, + double spatial_scale, + int64_t pooled_height, + int64_t pooled_width, + int64_t sampling_ratio, + bool aligned); + +at::Tensor roi_align_backward_autograd( + const at::Tensor& grad, + const at::Tensor& rois, + double spatial_scale, + int64_t pooled_height, + int64_t pooled_width, + int64_t batch_size, + int64_t channels, + int64_t height, + int64_t width, + int64_t sampling_ratio, + bool aligned); + +} // namespace ops +} // namespace vision diff --git a/torchvision/csrc/ROIPool.h b/torchvision/csrc/roi_pool.cpp similarity index 93% rename from torchvision/csrc/ROIPool.h rename to torchvision/csrc/roi_pool.cpp index 7950005f1bd..b2948e6dd23 100644 --- a/torchvision/csrc/ROIPool.h +++ b/torchvision/csrc/roi_pool.cpp @@ -1,17 +1,12 @@ -#pragma once +#include "roi_pool.h" +#include -#include "cpu/vision_cpu.h" - -#ifdef WITH_CUDA -#include "autocast.h" -#include "cuda/vision_cuda.h" -#endif -#ifdef WITH_HIP -#include "autocast.h" -#include "hip/vision_cuda.h" +#if defined(WITH_CUDA) || defined(WITH_HIP) +#include #endif -// TODO: put this stuff in torchvision namespace +namespace vision { +namespace ops { std::tuple roi_pool( const at::Tensor& input, @@ -26,7 +21,7 @@ std::tuple roi_pool( } #if defined(WITH_CUDA) || defined(WITH_HIP) -std::tuple 
diff --git a/torchvision/csrc/ROIPool.h b/torchvision/csrc/roi_pool.cpp
similarity index 93%
rename from torchvision/csrc/ROIPool.h
rename to torchvision/csrc/roi_pool.cpp
index 7950005f1bd..b2948e6dd23 100644
--- a/torchvision/csrc/ROIPool.h
+++ b/torchvision/csrc/roi_pool.cpp
@@ -1,17 +1,12 @@
-#pragma once
+#include "roi_pool.h"
+#include <torch/autograd.h>
 
-#include "cpu/vision_cpu.h"
-
-#ifdef WITH_CUDA
-#include "autocast.h"
-#include "cuda/vision_cuda.h"
-#endif
-#ifdef WITH_HIP
-#include "autocast.h"
-#include "hip/vision_cuda.h"
+#if defined(WITH_CUDA) || defined(WITH_HIP)
+#include <ATen/autocast_mode.h>
 #endif
 
-// TODO: put this stuff in torchvision namespace
+namespace vision {
+namespace ops {
 
 std::tuple<at::Tensor, at::Tensor> roi_pool(
     const at::Tensor& input,
@@ -26,7 +21,7 @@
 }
 
 #if defined(WITH_CUDA) || defined(WITH_HIP)
-std::tuple<at::Tensor, at::Tensor> ROIPool_autocast(
+std::tuple<at::Tensor, at::Tensor> roi_pool_autocast(
     const at::Tensor& input,
     const at::Tensor& rois,
     double spatial_scale,
@@ -73,6 +68,8 @@ at::Tensor _roi_pool_backward(
       width);
 }
 
+namespace {
+
 class ROIPoolFunction : public torch::autograd::Function<ROIPoolFunction> {
  public:
   static torch::autograd::variable_list forward(
@@ -165,7 +162,9 @@ class ROIPoolBackwardFunction
   }
 };
 
-std::tuple<at::Tensor, at::Tensor> ROIPool_autograd(
+} // namespace
+
+std::tuple<at::Tensor, at::Tensor> roi_pool_autograd(
     const at::Tensor& input,
     const at::Tensor& rois,
     double spatial_scale,
@@ -177,7 +176,7 @@ std::tuple<at::Tensor, at::Tensor> ROIPool_autograd(
   return std::make_tuple(result[0], result[1]);
 }
 
-at::Tensor ROIPool_backward_autograd(
+at::Tensor roi_pool_backward_autograd(
     const at::Tensor& grad,
     const at::Tensor& rois,
     const at::Tensor& argmax,
@@ -200,3 +199,6 @@ at::Tensor ROIPool_backward_autograd(
       height,
       width)[0];
 }
+
+} // namespace ops
+} // namespace vision
diff --git a/torchvision/csrc/roi_pool.h b/torchvision/csrc/roi_pool.h
new file mode 100644
index 00000000000..ac005914107
--- /dev/null
+++ b/torchvision/csrc/roi_pool.h
@@ -0,0 +1,67 @@
+#pragma once
+
+#include "cpu/roi_pool_kernel.h"
+
+#ifdef WITH_CUDA
+#include "cuda/roi_pool_kernel.h"
+#endif
+#ifdef WITH_HIP
+#include "hip/roi_pool_kernel.h"
+#endif
+
+namespace vision {
+namespace ops {
+
+// C++ Forward
+std::tuple<at::Tensor, at::Tensor> roi_pool(
+    const at::Tensor& input,
+    const at::Tensor& rois,
+    double spatial_scale,
+    int64_t pooled_height,
+    int64_t pooled_width);
+
+// Autocast Forward
+#if defined(WITH_CUDA) || defined(WITH_HIP)
+std::tuple<at::Tensor, at::Tensor> roi_pool_autocast(
+    const at::Tensor& input,
+    const at::Tensor& rois,
+    double spatial_scale,
+    int64_t pooled_height,
+    int64_t pooled_width);
+#endif
+
+// C++ Backward
+at::Tensor _roi_pool_backward(
+    const at::Tensor& grad,
+    const at::Tensor& rois,
+    const at::Tensor& argmax,
+    double spatial_scale,
+    int64_t pooled_height,
+    int64_t pooled_width,
+    int64_t batch_size,
+    int64_t channels,
+    int64_t height,
+    int64_t width);
+
+// Autograd Forward and Backward
+std::tuple<at::Tensor, at::Tensor> roi_pool_autograd(
+    const at::Tensor& input,
+    const at::Tensor& rois,
+    double spatial_scale,
+    int64_t pooled_height,
+    int64_t pooled_width);
+
+at::Tensor roi_pool_backward_autograd(
+    const at::Tensor& grad,
+    const at::Tensor& rois,
+    const at::Tensor& argmax,
+    double spatial_scale,
+    int64_t pooled_height,
+    int64_t pooled_width,
+    int64_t batch_size,
+    int64_t channels,
+    int64_t height,
+    int64_t width);
+
+} // namespace ops
+} // namespace vision
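With roi_pool now living in the vision::ops namespace, downstream C++ code reaches it through the qualified name. A hypothetical caller (the torchvision/ include prefix assumes the installed-headers layout from CMakeLists.txt; adjust to your build):

#include <tuple>
#include <torchvision/roi_pool.h>

at::Tensor pool_features(const at::Tensor& feat, const at::Tensor& rois) {
  // 7x7 pooling on a feature map at 1/16 of the input resolution.
  auto out = vision::ops::roi_pool(feat, rois, 1.0 / 16, 7, 7);
  return std::get<0>(out); // (pooled output, argmax) tuple
}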
Tensor"); @@ -62,57 +66,57 @@ TORCH_LIBRARY(torchvision, m) { } TORCH_LIBRARY_IMPL(torchvision, CPU, m) { - m.impl("deform_conv2d", DeformConv2d_forward_cpu); - m.impl("_deform_conv2d_backward", DeformConv2d_backward_cpu); + m.impl("deform_conv2d", deform_conv2d_forward_cpu); + m.impl("_deform_conv2d_backward", deform_conv2d_backward_cpu); m.impl("nms", nms_cpu); - m.impl("ps_roi_align", PSROIAlign_forward_cpu); - m.impl("_ps_roi_align_backward", PSROIAlign_backward_cpu); - m.impl("ps_roi_pool", PSROIPool_forward_cpu); - m.impl("_ps_roi_pool_backward", PSROIPool_backward_cpu); - m.impl("roi_align", ROIAlign_forward_cpu); - m.impl("_roi_align_backward", ROIAlign_backward_cpu); - m.impl("roi_pool", ROIPool_forward_cpu); - m.impl("_roi_pool_backward", ROIPool_backward_cpu); + m.impl("ps_roi_align", ps_roi_align_forward_cpu); + m.impl("_ps_roi_align_backward", ps_roi_align_backward_cpu); + m.impl("ps_roi_pool", ps_roi_pool_forward_cpu); + m.impl("_ps_roi_pool_backward", ps_roi_pool_backward_cpu); + m.impl("roi_align", roi_align_forward_cpu); + m.impl("_roi_align_backward", roi_align_backward_cpu); + m.impl("roi_pool", roi_pool_forward_cpu); + m.impl("_roi_pool_backward", roi_pool_backward_cpu); } // TODO: Place this in a hypothetical separate torchvision_cuda library #if defined(WITH_CUDA) || defined(WITH_HIP) TORCH_LIBRARY_IMPL(torchvision, CUDA, m) { - m.impl("deform_conv2d", DeformConv2d_forward_cuda); - m.impl("_deform_conv2d_backward", DeformConv2d_backward_cuda); + m.impl("deform_conv2d", deform_conv2d_forward_cuda); + m.impl("_deform_conv2d_backward", deform_conv2d_backward_cuda); m.impl("nms", nms_cuda); - m.impl("ps_roi_align", PSROIAlign_forward_cuda); - m.impl("_ps_roi_align_backward", PSROIAlign_backward_cuda); - m.impl("ps_roi_pool", PSROIPool_forward_cuda); - m.impl("_ps_roi_pool_backward", PSROIPool_backward_cuda); - m.impl("roi_align", ROIAlign_forward_cuda); - m.impl("_roi_align_backward", ROIAlign_backward_cuda); - m.impl("roi_pool", ROIPool_forward_cuda); - m.impl("_roi_pool_backward", ROIPool_backward_cuda); + m.impl("ps_roi_align", ps_roi_align_forward_cuda); + m.impl("_ps_roi_align_backward", ps_roi_align_backward_cuda); + m.impl("ps_roi_pool", ps_roi_pool_forward_cuda); + m.impl("_ps_roi_pool_backward", ps_roi_pool_backward_cuda); + m.impl("roi_align", roi_align_forward_cuda); + m.impl("_roi_align_backward", roi_align_backward_cuda); + m.impl("roi_pool", roi_pool_forward_cuda); + m.impl("_roi_pool_backward", roi_pool_backward_cuda); } #endif // Autocast only needs to wrap forward pass ops. 
 #if defined(WITH_CUDA) || defined(WITH_HIP)
 TORCH_LIBRARY_IMPL(torchvision, Autocast, m) {
-  m.impl("deform_conv2d", DeformConv2d_autocast);
+  m.impl("deform_conv2d", deform_conv2d_autocast);
   m.impl("nms", nms_autocast);
-  m.impl("ps_roi_align", PSROIAlign_autocast);
-  m.impl("ps_roi_pool", PSROIPool_autocast);
-  m.impl("roi_align", ROIAlign_autocast);
-  m.impl("roi_pool", ROIPool_autocast);
+  m.impl("ps_roi_align", ps_roi_align_autocast);
+  m.impl("ps_roi_pool", ps_roi_pool_autocast);
+  m.impl("roi_align", roi_align_autocast);
+  m.impl("roi_pool", roi_pool_autocast);
 }
 #endif
 
 TORCH_LIBRARY_IMPL(torchvision, Autograd, m) {
-  m.impl("deform_conv2d", DeformConv2d_autograd);
-  m.impl("_deform_conv2d_backward", DeformConv2d_backward_autograd);
-  m.impl("ps_roi_align", PSROIAlign_autograd);
-  m.impl("_ps_roi_align_backward", PSROIAlign_backward_autograd);
-  m.impl("ps_roi_pool", PSROIPool_autograd);
-  m.impl("_ps_roi_pool_backward", PSROIPool_backward_autograd);
-  m.impl("roi_align", ROIAlign_autograd);
-  m.impl("_roi_align_backward", ROIAlign_backward_autograd);
-  m.impl("roi_pool", ROIPool_autograd);
-  m.impl("_roi_pool_backward", ROIPool_backward_autograd);
+  m.impl("deform_conv2d", deform_conv2d_autograd);
+  m.impl("_deform_conv2d_backward", deform_conv2d_backward_autograd);
+  m.impl("ps_roi_align", ps_roi_align_autograd);
+  m.impl("_ps_roi_align_backward", ps_roi_align_backward_autograd);
+  m.impl("ps_roi_pool", ps_roi_pool_autograd);
+  m.impl("_ps_roi_pool_backward", ps_roi_pool_backward_autograd);
+  m.impl("roi_align", roi_align_autograd);
+  m.impl("_roi_align_backward", roi_align_backward_autograd);
+  m.impl("roi_pool", roi_pool_autograd);
+  m.impl("_roi_pool_backward", roi_pool_backward_autograd);
 }
diff --git a/torchvision/csrc/vision.h b/torchvision/csrc/vision.h
index 4122284c083..806ebef7589 100644
--- a/torchvision/csrc/vision.h
+++ b/torchvision/csrc/vision.h
@@ -1,7 +1,5 @@
-#ifndef VISION_H
-#define VISION_H
+#pragma once
 
-#include <torch/script.h>
 #include <cstdint>
 #include "macros.h"
 
@@ -15,5 +13,3 @@ namespace detail {
 VISION_INLINE_VARIABLE int64_t _cuda_version = cuda_version();
 } // namespace detail
 } // namespace vision
-
-#endif // VISION_H
diff --git a/travis-scripts/run-clang-format/run-clang-format.py b/travis-scripts/run-clang-format/run-clang-format.py
index 54e193db45b..fd2913bd70e 100755
--- a/travis-scripts/run-clang-format/run-clang-format.py
+++ b/travis-scripts/run-clang-format/run-clang-format.py
@@ -28,7 +28,7 @@
 
 DEVNULL = open(os.devnull, "wb")
 
-DEFAULT_EXTENSIONS = 'c,h,C,H,cpp,hpp,cc,hh,c++,h++,cxx,hxx'
+DEFAULT_EXTENSIONS = 'c,h,C,H,cpp,hpp,cc,hh,c++,h++,cxx,hxx,cu'
 
 
 class ExitStatus:
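For reference, the registration pattern vision.cpp relies on throughout this patch: TORCH_LIBRARY defines each operator schema once, and every TORCH_LIBRARY_IMPL block binds kernels to a single dispatch key (CPU, CUDA, Autocast, Autograd). A self-contained toy example of the same mechanism, with a made-up namespace and op so it does not collide with anything real:

#include <ATen/ATen.h>
#include <torch/library.h>

namespace {

// Plain CPU kernel for the toy op.
at::Tensor my_relu_cpu(const at::Tensor& self) {
  return self.clamp_min(0);
}

} // namespace

// Declare the schema once; no kernel is attached here.
TORCH_LIBRARY(toyops, m) {
  m.def("my_relu(Tensor self) -> Tensor");
}

// Bind the CPU kernel to the CPU dispatch key; a GPU build would add a
// second TORCH_LIBRARY_IMPL(toyops, CUDA, m) block, as vision.cpp does.
TORCH_LIBRARY_IMPL(toyops, CPU, m) {
  m.impl("my_relu", my_relu_cpu);
}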