Encapsulate and standardize C++ Ops - Clean up #3094

Merged · 5 commits · Dec 2, 2020
4 changes: 2 additions & 2 deletions setup.py
@@ -152,8 +152,8 @@ def get_extensions():
         )
         source_cuda = glob.glob(os.path.join(extensions_dir, 'hip', '*.hip'))
         # Copy over additional files
-        shutil.copy("torchvision/csrc/cuda/cuda_helpers.h", "torchvision/csrc/hip/cuda_helpers.h")
-        shutil.copy("torchvision/csrc/cuda/vision_cuda.h", "torchvision/csrc/hip/vision_cuda.h")
+        for file in glob.glob(r"torchvision/csrc/cuda/*.h"):
+            shutil.copy(file, "torchvision/csrc/hip")
 
     else:
         source_cuda = glob.glob(os.path.join(extensions_dir, 'cuda', '*.cu'))
1 change: 0 additions & 1 deletion test/tracing/frcnn/test_frcnn_tracing.cpp
@@ -2,7 +2,6 @@
 #include <torch/script.h>
 #include <torch/torch.h>
 #include <torchvision/roi_align.h>
-#include <torchvision/cpu/vision_cpu.h>
 #include <torchvision/nms.h>
 
 #ifdef _WIN32
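
Dropping the direct include of cpu/vision_cpu.h is the point of the clean-up: callers depend only on the per-op public headers, and backend selection happens behind them. A minimal caller sketch, assuming torchvision/nms.h declared a top-level nms(dets, scores, iou_threshold) at the time of this PR (the signature here is an assumption, not taken from this diff):

#include <torch/torch.h>
#include <torchvision/nms.h>

int main() {
  // Two overlapping boxes as [x1, y1, x2, y2]; hypothetical values.
  at::Tensor dets = torch::tensor({{0.f, 0.f, 10.f, 10.f},
                                   {1.f, 1.f, 11.f, 11.f}});
  at::Tensor scores = torch::tensor({0.9f, 0.8f});
  at::Tensor keep = nms(dets, scores, /*iou_threshold=*/0.5);
  return keep.numel() > 0 ? 0 : 1;
}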
7 changes: 0 additions & 7 deletions torchvision/csrc/autocast.h

This file was deleted.

4 changes: 2 additions & 2 deletions torchvision/csrc/cpu/deform_conv2d_kernel.cpp
@@ -350,7 +350,7 @@ void compute_grad_input(
       channels * weight_h * weight_w * out_h * out_w * parallel_imgs;
 
   AT_DISPATCH_FLOATING_TYPES_AND_HALF(
-      columns.scalar_type(), "deformable_col2im", ([&] {
+      columns.scalar_type(), "compute_grad_input", ([&] {
         deformable_col2im_kernel(
             num_kernels,
             columns.data_ptr<scalar_t>(),
@@ -551,7 +551,7 @@ void compute_grad_offset_and_mask(
       out_h * out_w * 2 * weight_h * weight_w * n_offset_grps * parallel_imgs;
 
   AT_DISPATCH_FLOATING_TYPES_AND_HALF(
-      columns.scalar_type(), "deformable_col2im_coord", ([&] {
+      columns.scalar_type(), "compute_grad_offset_and_mask", ([&] {
         deformable_col2im_coord_kernel(
             num_kernels,
             columns.data_ptr<scalar_t>(),
6 changes: 3 additions & 3 deletions torchvision/csrc/cpu/nms_kernel.cpp
@@ -3,7 +3,7 @@
 namespace {
 
 template <typename scalar_t>
-at::Tensor nms_kernel(
+at::Tensor nms_kernel_impl(
     const at::Tensor& dets,
     const at::Tensor& scores,
     double iou_threshold) {
@@ -98,8 +98,8 @@ at::Tensor nms_cpu(
 
   auto result = at::empty({0}, dets.options());
 
-  AT_DISPATCH_FLOATING_TYPES(dets.scalar_type(), "nms", [&] {
-    result = nms_kernel<scalar_t>(dets, scores, iou_threshold);
+  AT_DISPATCH_FLOATING_TYPES(dets.scalar_type(), "nms_cpu", [&] {
+    result = nms_kernel_impl<scalar_t>(dets, scores, iou_threshold);
   });
   return result;
 }
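
A note on the renames in this hunk: the string argument to AT_DISPATCH_FLOATING_TYPES is only a label that surfaces in the "not implemented for" error, so aligning it with the enclosing function ("nms_cpu") and suffixing the template with _impl changes no behavior. Roughly, the macro expands to a switch over the dtype (a simplified sketch, not the exact ATen expansion):

switch (dets.scalar_type()) {
  case at::ScalarType::Double: {
    using scalar_t = double;  // the lambda body is stamped out per dtype
    result = nms_kernel_impl<scalar_t>(dets, scores, iou_threshold);
    break;
  }
  case at::ScalarType::Float: {
    using scalar_t = float;
    result = nms_kernel_impl<scalar_t>(dets, scores, iou_threshold);
    break;
  }
  default:
    // This is the only place the "nms_cpu" label is used.
    TORCH_CHECK(false, "nms_cpu", " not implemented for '", dets.scalar_type(), "'");
}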
4 changes: 2 additions & 2 deletions torchvision/csrc/cpu/ps_roi_align_kernel.cpp
@@ -340,7 +340,7 @@ std::tuple<at::Tensor, at::Tensor> ps_roi_align_forward_cpu(
 
   auto input_ = input.contiguous(), rois_ = rois.contiguous();
   AT_DISPATCH_FLOATING_TYPES_AND_HALF(
-      input.scalar_type(), "ps_roi_align_forward", [&] {
+      input.scalar_type(), "ps_roi_align_forward_cpu", [&] {
         ps_roi_align_forward_kernel_impl<scalar_t>(
             output_size,
             input_.data_ptr<scalar_t>(),
@@ -397,7 +397,7 @@ at::Tensor ps_roi_align_backward_cpu(
 
   auto grad_ = grad.contiguous(), rois_ = rois.contiguous();
   AT_DISPATCH_FLOATING_TYPES_AND_HALF(
-      grad.scalar_type(), "ps_roi_align_backward", [&] {
+      grad.scalar_type(), "ps_roi_align_backward_cpu", [&] {
         ps_roi_align_backward_kernel_impl<scalar_t>(
             grad.numel(),
             grad_.data_ptr<scalar_t>(),
4 changes: 2 additions & 2 deletions torchvision/csrc/cpu/ps_roi_pool_kernel.cpp
@@ -183,7 +183,7 @@ std::tuple<at::Tensor, at::Tensor> ps_roi_pool_forward_cpu(
 
   auto input_ = input.contiguous(), rois_ = rois.contiguous();
   AT_DISPATCH_FLOATING_TYPES_AND_HALF(
-      input.scalar_type(), "ps_roi_pool_forward", [&] {
+      input.scalar_type(), "ps_roi_pool_forward_cpu", [&] {
         ps_roi_pool_forward_kernel_impl<scalar_t>(
             input_.data_ptr<scalar_t>(),
             spatial_scale,
@@ -238,7 +238,7 @@ at::Tensor ps_roi_pool_backward_cpu(
 
   auto grad_ = grad.contiguous(), rois_ = rois.contiguous();
   AT_DISPATCH_FLOATING_TYPES_AND_HALF(
-      grad.scalar_type(), "ps_roi_pool_backward", [&] {
+      grad.scalar_type(), "ps_roi_pool_backward_cpu", [&] {
         ps_roi_pool_backward_kernel_impl<scalar_t>(
             grad_.data_ptr<scalar_t>(),
             channel_mapping.data_ptr<int>(),
4 changes: 2 additions & 2 deletions torchvision/csrc/cpu/roi_align_kernel.cpp
@@ -419,7 +419,7 @@ at::Tensor roi_align_forward_cpu(
 
   auto input_ = input.contiguous(), rois_ = rois.contiguous();
   AT_DISPATCH_FLOATING_TYPES_AND_HALF(
-      input.scalar_type(), "roi_align_forward", [&] {
+      input.scalar_type(), "roi_align_forward_cpu", [&] {
         roi_align_forward_kernel_impl<scalar_t>(
             output_size,
             input_.data_ptr<scalar_t>(),
@@ -473,7 +473,7 @@ at::Tensor roi_align_backward_cpu(
 
   auto rois_ = rois.contiguous();
   AT_DISPATCH_FLOATING_TYPES_AND_HALF(
-      grad.scalar_type(), "roi_align_forward", [&] {
+      grad.scalar_type(), "roi_align_backward_cpu", [&] {
         roi_align_backward_kernel_impl<scalar_t>(
             grad.numel(),
             grad.data_ptr<scalar_t>(),
4 changes: 2 additions & 2 deletions torchvision/csrc/cpu/roi_pool_kernel.cpp
@@ -154,7 +154,7 @@ std::tuple<at::Tensor, at::Tensor> roi_pool_forward_cpu(
 
   auto input_ = input.contiguous(), rois_ = rois.contiguous();
   AT_DISPATCH_FLOATING_TYPES_AND_HALF(
-      input.scalar_type(), "roi_pool_forward", [&] {
+      input.scalar_type(), "roi_pool_forward_cpu", [&] {
         roi_pool_forward_kernel_impl<scalar_t>(
             input_.data_ptr<scalar_t>(),
             spatial_scale,
@@ -212,7 +212,7 @@ at::Tensor roi_pool_backward_cpu(
 
   auto rois_ = rois.contiguous();
   AT_DISPATCH_FLOATING_TYPES_AND_HALF(
-      grad.scalar_type(), "roi_pool_backward", [&] {
+      grad.scalar_type(), "roi_pool_backward_cpu", [&] {
         roi_pool_backward_kernel_impl<scalar_t>(
             grad.data_ptr<scalar_t>(),
             argmax.data_ptr<int>(),
4 changes: 1 addition & 3 deletions torchvision/csrc/cpu/video/register.cpp
@@ -1,5 +1,4 @@
-#ifndef REGISTER_H
-#define REGISTER_H
+#pragma once
 
 #include "Video.h"
 
@@ -15,4 +14,3 @@ static auto registerVideo =
         .def("next", &Video::Next);
 
 } // namespace
-#endif
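
The guard-to-pragma change above is behavior-preserving: both forms stop a file from being textually included twice. #pragma once is non-standard but supported by GCC, Clang, and MSVC, and it avoids having to keep the REGISTER_H macro unique across the project (a guard in a .cpp is unusual anyway, since translation units are compiled rather than included). Side by side:

// Classic include guard: needs a project-unique macro name.
#ifndef REGISTER_H
#define REGISTER_H
// ... declarations ...
#endif // REGISTER_H

// Equivalent modern form: one line, nothing to keep unique.
#pragma once
// ... declarations ...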
5 changes: 0 additions & 5 deletions torchvision/csrc/cpu/vision_cpu.h

This file was deleted.

23 changes: 9 additions & 14 deletions torchvision/csrc/cuda/deform_conv2d_kernel.cu
@@ -66,7 +66,6 @@
 // modified from
 // https://github.com/open-mmlab/mmdetection/blob/master/mmdet/ops/dcn/src/deform_conv_cuda.cpp
 
-#include <ATen/ATen.h>
 #include <ATen/cuda/CUDAContext.h>
 #include <c10/cuda/CUDAGuard.h>
 #include <THC/THCAtomics.cuh>
@@ -88,7 +87,9 @@ inline unsigned int GET_THREADS() {
   return 512;
 }
 
-inline unsigned int GET_BLOCKS(const unsigned int THREADS, const unsigned int N) {
+inline unsigned int GET_BLOCKS(
+    const unsigned int THREADS,
+    const unsigned int N) {
   unsigned int kMaxGridNum =
       at::cuda::getCurrentDeviceProperties()->maxGridSize[0];
   return std::min(kMaxGridNum, (N + THREADS - 1) / THREADS);
@@ -235,10 +236,8 @@ void deformable_im2col(
   const unsigned int blocks = GET_BLOCKS(threads, num_kernels);
 
   AT_DISPATCH_FLOATING_TYPES_AND_HALF(
-      input.scalar_type(), "deformable_im2col_gpu", ([&] {
-        deformable_im2col_kernel<<<
-            blocks,
-            threads>>>(
+      input.scalar_type(), "deformable_im2col", ([&] {
+        deformable_im2col_kernel<<<blocks, threads>>>(
             num_kernels,
             input.data_ptr<scalar_t>(),
             data_offset.data_ptr<scalar_t>(),
@@ -381,10 +380,8 @@ void compute_grad_input(
   const unsigned int blocks = GET_BLOCKS(threads, num_kernels);
 
   AT_DISPATCH_FLOATING_TYPES_AND_HALF(
-      columns.scalar_type(), "deformable_col2im_gpu", ([&] {
-        deformable_col2im_kernel<<<
-            blocks,
-            threads>>>(
+      columns.scalar_type(), "compute_grad_input", ([&] {
+        deformable_col2im_kernel<<<blocks, threads>>>(
             num_kernels,
             columns.data_ptr<scalar_t>(),
             offset.data_ptr<scalar_t>(),
@@ -589,10 +586,8 @@ void compute_grad_offset_and_mask(
   const unsigned int blocks = GET_BLOCKS(threads, num_kernels);
 
   AT_DISPATCH_FLOATING_TYPES_AND_HALF(
-      columns.scalar_type(), "deformable_col2im_coord_gpu", ([&] {
-        deformable_col2im_coord_kernel<<<
-            blocks,
-            threads>>>(
+      columns.scalar_type(), "compute_grad_offset_and_mask", ([&] {
+        deformable_col2im_coord_kernel<<<blocks, threads>>>(
             num_kernels,
             columns.data_ptr<scalar_t>(),
             input.data_ptr<scalar_t>(),
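
For reference, GET_BLOCKS above is ceil division clamped to the device's maximum grid width in x. A host-side sketch of the same arithmetic with illustrative numbers (the element count is made up):

#include <algorithm>

// blocks = min(max_grid, ceil(N / THREADS))
inline unsigned int get_blocks_sketch(
    unsigned int max_grid, unsigned int threads, unsigned int n) {
  return std::min(max_grid, (n + threads - 1) / threads);  // round up, clamp
}
// e.g. n = 2'000'000, threads = 512:
//   (2'000'000 + 511) / 512 = 3907 blocks, far below any real maxGridSize[0].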
13 changes: 7 additions & 6 deletions torchvision/csrc/cuda/nms_kernel.cu
@@ -1,4 +1,3 @@
-#include <ATen/ATen.h>
 #include <ATen/cuda/CUDAContext.h>
 #include <c10/cuda/CUDAGuard.h>
@@ -24,7 +23,7 @@ __device__ inline bool devIoU(
 }
 
 template <typename T>
-__global__ void nms_kernel(
+__global__ void nms_kernel_impl(
     int n_boxes,
     double iou_threshold,
     const T* dev_boxes,
@@ -74,7 +73,8 @@ __global__ void nms_kernel(
 
 } // namespace
 
-at::Tensor nms_cuda(const at::Tensor& dets,
+at::Tensor nms_cuda(
+    const at::Tensor& dets,
     const at::Tensor& scores,
     double iou_threshold) {
   TORCH_CHECK(dets.is_cuda(), "dets must be a CUDA tensor");
@@ -124,16 +124,17 @@ at::Tensor nms_cuda(
   cudaStream_t stream = at::cuda::getCurrentCUDAStream();
 
   AT_DISPATCH_FLOATING_TYPES_AND_HALF(
-      dets_sorted.scalar_type(), "nms_kernel_cuda", [&] {
-        nms_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
+      dets_sorted.scalar_type(), "nms_cuda", [&] {
+        nms_kernel_impl<scalar_t><<<blocks, threads, 0, stream>>>(
             dets_num,
             iou_threshold,
             dets_sorted.data_ptr<scalar_t>(),
             (unsigned long long*)mask.data_ptr<int64_t>());
       });
 
   at::Tensor mask_cpu = mask.to(at::kCPU);
-  unsigned long long* mask_host = (unsigned long long*)mask_cpu.data_ptr<int64_t>();
+  unsigned long long* mask_host =
+      (unsigned long long*)mask_cpu.data_ptr<int64_t>();
 
   std::vector<unsigned long long> remv(col_blocks);
   memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks);
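
Context for mask_host and remv above: the kernel writes, for each box i, col_blocks 64-bit words in which bit j of word b is set when box i suppresses box b * 64 + j. The host pass that follows keeps a box only if no earlier kept box has suppressed it. A simplified sketch of that decode, using the names from this diff (not the verbatim continuation of the file):

std::vector<int64_t> keep;
for (int64_t i = 0; i < dets_num; i++) {
  int64_t block = i / 64;  // which 64-bit word covers box i
  int64_t bit = i % 64;    // which bit inside that word
  if (!(remv[block] & (1ULL << bit))) {  // box i is not suppressed
    keep.push_back(i);
    const unsigned long long* row = mask_host + i * col_blocks;
    for (int64_t b = 0; b < col_blocks; b++)
      remv[b] |= row[b];  // fold in every box that i suppresses
  }
}
// keep holds indices into dets_sorted (descending score order).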
17 changes: 7 additions & 10 deletions torchvision/csrc/cuda/ps_roi_align_kernel.cu
@@ -339,14 +339,13 @@ std::tuple<at::Tensor, at::Tensor> ps_roi_align_forward_cuda(
   cudaStream_t stream = at::cuda::getCurrentCUDAStream();
 
   dim3 grid(std::min(
-      ceil_div(static_cast<int64_t>(output_size), static_cast<int64_t>(512)),
+      ceil_div(static_cast<int64_t>(output_size), static_cast<int64_t>(512)),
       static_cast<int64_t>(4096)));
   dim3 block(512);
 
-  auto input_ = input.contiguous(),
-       rois_ = rois.contiguous();
+  auto input_ = input.contiguous(), rois_ = rois.contiguous();
   AT_DISPATCH_FLOATING_TYPES_AND_HALF(
-      input.scalar_type(), "ps_roi_align_forward", [&] {
+      input.scalar_type(), "ps_roi_align_forward_cuda", [&] {
         ps_roi_align_forward_kernel_impl<scalar_t><<<grid, block, 0, stream>>>(
             output_size,
             input_.data_ptr<scalar_t>(),
@@ -383,8 +382,7 @@ at::Tensor ps_roi_align_backward_cuda(
   TORCH_CHECK(grad.is_cuda(), "grad must be a CUDA tensor");
   TORCH_CHECK(rois.is_cuda(), "rois must be a CUDA tensor");
   TORCH_CHECK(
-      channel_mapping.is_cuda(),
-      "channel_mapping must be a CUDA tensor");
+      channel_mapping.is_cuda(), "channel_mapping must be a CUDA tensor");
 
   at::TensorArg grad_t{grad, "grad", 1}, rois_t{rois, "rois", 2},
       channel_mapping_t{channel_mapping, "channel_mapping", 3};
@@ -402,7 +400,7 @@ at::Tensor ps_roi_align_backward_cuda(
   cudaStream_t stream = at::cuda::getCurrentCUDAStream();
 
   dim3 grid(std::min(
-      ceil_div(static_cast<int64_t>(grad.numel()), static_cast<int64_t>(512)),
+      ceil_div(static_cast<int64_t>(grad.numel()), static_cast<int64_t>(512)),
       static_cast<int64_t>(4096)));
   dim3 block(512);
 
@@ -414,10 +412,9 @@ at::Tensor ps_roi_align_backward_cuda(
 
   int channels_out = channels / (pooled_height * pooled_width);
 
-  auto grad_ = grad.contiguous(),
-       rois_ = rois.contiguous();
+  auto grad_ = grad.contiguous(), rois_ = rois.contiguous();
   AT_DISPATCH_FLOATING_TYPES_AND_HALF(
-      grad.scalar_type(), "ps_roi_align_backward", [&] {
+      grad.scalar_type(), "ps_roi_align_backward_cuda", [&] {
         ps_roi_align_backward_kernel_impl<scalar_t><<<grid, block, 0, stream>>>(
             grad.numel(),
             grad_.data_ptr<scalar_t>(),
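
The grid size above is capped at 4096 blocks; that cap is only correct because these kernels walk their output with a grid-stride loop (the CUDA_1D_KERNEL_LOOP helper from cuda_helpers.h), so each thread picks up additional elements whenever output_size exceeds grid * block. A generic sketch of the pattern, assuming nothing beyond standard CUDA:

template <typename T>
__global__ void copy_kernel_sketch(int64_t nthreads, const T* in, T* out) {
  // Grid-stride loop: valid for any grid size, including a clamped one.
  for (int64_t i = blockIdx.x * (int64_t)blockDim.x + threadIdx.x;
       i < nthreads;
       i += (int64_t)blockDim.x * gridDim.x) {
    out[i] = in[i];
  }
}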
17 changes: 7 additions & 10 deletions torchvision/csrc/cuda/ps_roi_pool_kernel.cu
@@ -179,14 +179,13 @@ std::tuple<at::Tensor, at::Tensor> ps_roi_pool_forward_cuda(
   cudaStream_t stream = at::cuda::getCurrentCUDAStream();
 
   dim3 grid(std::min(
-      ceil_div(static_cast<int64_t>(output_size), static_cast<int64_t>(512)),
+      ceil_div(static_cast<int64_t>(output_size), static_cast<int64_t>(512)),
       static_cast<int64_t>(4096)));
   dim3 block(512);
 
-  auto input_ = input.contiguous(),
-       rois_ = rois.contiguous();
+  auto input_ = input.contiguous(), rois_ = rois.contiguous();
   AT_DISPATCH_FLOATING_TYPES_AND_HALF(
-      input.scalar_type(), "ps_roi_pool_forward", [&] {
+      input.scalar_type(), "ps_roi_pool_forward_cuda", [&] {
         ps_roi_pool_forward_kernel_impl<scalar_t><<<grid, block, 0, stream>>>(
             output_size,
             input_.data_ptr<scalar_t>(),
@@ -220,8 +219,7 @@ at::Tensor ps_roi_pool_backward_cuda(
   TORCH_CHECK(grad.is_cuda(), "grad must be a CUDA tensor");
   TORCH_CHECK(rois.is_cuda(), "rois must be a CUDA tensor");
   TORCH_CHECK(
-      channel_mapping.is_cuda(),
-      "channel_mapping must be a CUDA tensor");
+      channel_mapping.is_cuda(), "channel_mapping must be a CUDA tensor");
 
   at::TensorArg grad_t{grad, "grad", 1}, rois_t{rois, "rois", 2},
       channel_mapping_t{channel_mapping, "channel_mapping", 3};
@@ -239,7 +237,7 @@ at::Tensor ps_roi_pool_backward_cuda(
   cudaStream_t stream = at::cuda::getCurrentCUDAStream();
 
   dim3 grid(std::min(
-      ceil_div(static_cast<int64_t>(grad.numel()), static_cast<int64_t>(512)),
+      ceil_div(static_cast<int64_t>(grad.numel()), static_cast<int64_t>(512)),
       static_cast<int64_t>(4096)));
   dim3 block(512);
 
@@ -251,10 +249,9 @@ at::Tensor ps_roi_pool_backward_cuda(
 
   int channels_out = channels / (pooled_height * pooled_width);
 
-  auto grad_ = grad.contiguous(),
-       rois_ = rois.contiguous();
+  auto grad_ = grad.contiguous(), rois_ = rois.contiguous();
   AT_DISPATCH_FLOATING_TYPES_AND_HALF(
-      grad.scalar_type(), "ps_roi_pool_backward", [&] {
+      grad.scalar_type(), "ps_roi_pool_backward_cuda", [&] {
         ps_roi_pool_backward_kernel_impl<scalar_t><<<grid, block, 0, stream>>>(
             grad.numel(),
             grad_.data_ptr<scalar_t>(),
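
One more pattern this PR compacts: auto grad_ = grad.contiguous(), rois_ = rois.contiguous(); on a single line. The call matters because the *_kernel_impl functions index raw pointers as dense buffers, so a strided view must be materialized before data_ptr is taken. A minimal standalone illustration (not from this diff):

#include <ATen/ATen.h>

void contiguity_sketch() {
  at::Tensor t = at::rand({4, 3}).t();    // transposed view, strides {1, 3}
  TORCH_CHECK(!t.is_contiguous());
  at::Tensor t_ = t.contiguous();         // dense row-major copy
  const float* p = t_.data_ptr<float>();  // safe to index linearly now
  (void)p;
}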