From bd7f639be51480416c8fb3e36accb035ff1f5208 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 31 Mar 2021 16:36:54 +0100 Subject: [PATCH 01/26] WIP --- .../ops/quantized/cpu/qroi_align_kernel.cpp | 286 ++++++++++++++++++ 1 file changed, 286 insertions(+) create mode 100644 torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp diff --git a/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp b/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp new file mode 100644 index 00000000000..2926259a70a --- /dev/null +++ b/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp @@ -0,0 +1,286 @@ +#include +#include +#include +#include +#include + +namespace vision { +namespace ops { + +namespace { + +template +struct PreCalc { + int pos1; + int pos2; + int pos3; + int pos4; + T w1; + T w2; + T w3; + T w4; +}; + +template +void pre_calc_for_bilinear_interpolate( + int height, + int width, + int pooled_height, + int pooled_width, + int iy_upper, + int ix_upper, + T roi_start_h, + T roi_start_w, + T bin_size_h, + T bin_size_w, + int roi_bin_grid_h, + int roi_bin_grid_w, + std::vector>& pre_calc) { + int pre_calc_index = 0; + for (int ph = 0; ph < pooled_height; ph++) { + for (int pw = 0; pw < pooled_width; pw++) { + for (int iy = 0; iy < iy_upper; iy++) { + const T yy = roi_start_h + ph * bin_size_h + + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 + for (int ix = 0; ix < ix_upper; ix++) { + const T xx = roi_start_w + pw * bin_size_w + + static_cast(ix + .5f) * bin_size_w / + static_cast(roi_bin_grid_w); + + T x = xx; + T y = yy; + // deal with: inverse elements are out of feature map boundary + if (y < -1.0 || y > height || x < -1.0 || x > width) { + // empty + PreCalc pc; + pc.pos1 = 0; + pc.pos2 = 0; + pc.pos3 = 0; + pc.pos4 = 0; + pc.w1 = 0; + pc.w2 = 0; + pc.w3 = 0; + pc.w4 = 0; + pre_calc[pre_calc_index] = pc; + pre_calc_index += 1; + continue; + } + + if (y <= 0) { + y = 0; + } + if (x <= 0) { + x = 0; + } + + 
int y_low = (int)y; + int x_low = (int)x; + int y_high; + int x_high; + + if (y_low >= height - 1) { + y_high = y_low = height - 1; + y = (T)y_low; + } else { + y_high = y_low + 1; + } + + if (x_low >= width - 1) { + x_high = x_low = width - 1; + x = (T)x_low; + } else { + x_high = x_low + 1; + } + + T ly = y - y_low; + T lx = x - x_low; + T hy = 1. - ly, hx = 1. - lx; + T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; + + // save weights and indeces + PreCalc pc; + pc.pos1 = y_low * width + x_low; + pc.pos2 = y_low * width + x_high; + pc.pos3 = y_high * width + x_low; + pc.pos4 = y_high * width + x_high; + pc.w1 = w1; + pc.w2 = w2; + pc.w3 = w3; + pc.w4 = w4; + pre_calc[pre_calc_index] = pc; + + pre_calc_index += 1; + } + } + } + } +} + + +template +void qroi_align_forward_kernel_impl( + int n_rois, + const T* input, + double & spatial_scale, + int channels, + int height, + int width, + int pooled_height, + int pooled_width, + int sampling_ratio, + bool aligned, + const T* rois, + T* output, + int64_t output_size) { + + for (int64_t i = 0; i < output_size; i++) { + output[i].val_ = 0; + } + + for (int n = 0; n < n_rois; n++) { + int index_n = n * channels * pooled_width * pooled_height; + + const T* offset_rois = rois + n * 5; + int roi_batch_ind = offset_rois[0].val_; + + // Do not using rounding; this implementation detail is critical + // T offset = aligned ? 
(T)0.5 : (T)0.0; + int offset = 0; // TODO fix this + float roi_start_w = offset_rois[1].val_ * spatial_scale - offset; + float roi_start_h = offset_rois[2].val_ * spatial_scale - offset; + float roi_end_w = offset_rois[3].val_ * spatial_scale - offset; + float roi_end_h = offset_rois[4].val_ * spatial_scale - offset; + + float roi_width = roi_end_w - roi_start_w; + float roi_height = roi_end_h - roi_start_h; + if (!aligned) { + // Force malformed ROIs to be 1x1 + roi_width = std::max(roi_width, 1.f); + roi_height = std::max(roi_height, 1.f); + } + + float bin_size_h = roi_height / pooled_height; + float bin_size_w = roi_width / pooled_width; + + // We use roi_bin_grid to sample the grid and mimic integral + int roi_bin_grid_h = (sampling_ratio > 0) + ? sampling_ratio + : ceil(roi_height / pooled_height); // e.g., = 2 + int roi_bin_grid_w = + (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); + + // We do average (integral) pooling inside a bin + // When the grid is empty, output zeros. + const int count = std::max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. 
= 4 + + // we want to precalculate indeces and weights shared by all chanels, + // this is the key point of optimiation + std::vector> pre_calc( + roi_bin_grid_h * roi_bin_grid_w * pooled_width * pooled_height); + pre_calc_for_bilinear_interpolate( + height, + width, + pooled_height, + pooled_width, + roi_bin_grid_h, + roi_bin_grid_w, + roi_start_h, + roi_start_w, + bin_size_h, + bin_size_w, + roi_bin_grid_h, + roi_bin_grid_w, + pre_calc); + + for (int c = 0; c < channels; c++) { + int index_n_c = index_n + c * pooled_width * pooled_height; + const T* offset_input = + input + (roi_batch_ind * channels + c) * height * width; + int pre_calc_index = 0; + + for (int ph = 0; ph < pooled_height; ph++) { + for (int pw = 0; pw < pooled_width; pw++) { + int index = index_n_c + ph * pooled_width + pw; + + float output_val = 0.; + for (int iy = 0; iy < roi_bin_grid_h; iy++) { + for (int ix = 0; ix < roi_bin_grid_w; ix++) { + PreCalc pc = pre_calc[pre_calc_index]; + output_val += pc.w1 * offset_input[pc.pos1].val_ + + pc.w2 * offset_input[pc.pos2].val_ + + pc.w3 * offset_input[pc.pos3].val_ + pc.w4 * offset_input[pc.pos4].val_; + + pre_calc_index += 1; + } + } + output_val /= count; + + output[index] = at::native::requantize_from_int(1.f, 0, (int)output_val); // TODO: this is wrong need to set scale and zero etc. 
+ } // for pw + } // for ph + } // for c + } // for n + +} + +at::Tensor qroi_align_forward_kernel( + const at::Tensor& input, + const at::Tensor& rois, + double spatial_scale, + int64_t pooled_height, + int64_t pooled_width, + int64_t sampling_ratio, + bool aligned) { + TORCH_CHECK(input.device().is_cpu(), "input must be a CPU tensor"); + TORCH_CHECK(rois.device().is_cpu(), "rois must be a CPU tensor"); + TORCH_CHECK(rois.size(1) == 5, "rois must have shape as Tensor[K, 5]"); + + at::TensorArg input_t{input, "input", 1}, rois_t{rois, "rois", 2}; + + at::CheckedFrom c = "qroi_align_forward_kernel"; + at::checkAllSameType(c, {input_t, rois_t}); + + auto num_rois = rois.size(0); + auto channels = input.size(1); + auto height = input.size(2); + auto width = input.size(3); + + // TODO: This should really be initialized to zero, not empty + at::Tensor output = at::_empty_affine_quantized( + {num_rois, channels, pooled_height, pooled_width}, input.options()); + + if (output.numel() == 0) + return output; + + auto input_ = input.contiguous(), rois_ = rois.contiguous(); + AT_DISPATCH_QINT_TYPES( + input.scalar_type(), "qroi_align_forward_kernel", [&] { + qroi_align_forward_kernel_impl( + num_rois, + input_.data_ptr(), + spatial_scale, + channels, + height, + width, + pooled_height, + pooled_width, + sampling_ratio, + aligned, + rois_.data_ptr(), + output.data_ptr(), + output.numel()); + }); + return output; +} + +} // namespace + +TORCH_LIBRARY_IMPL(torchvision, QuantizedCPU, m) { + m.impl( + TORCH_SELECTIVE_NAME("torchvision::roi_align"), + TORCH_FN(qroi_align_forward_kernel)); +} + +} // namespace ops +} // namespace vision \ No newline at end of file From 8d21449f8c4b56c3c394d1edc41cf7b37b0a4240 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 1 Apr 2021 09:40:16 +0100 Subject: [PATCH 02/26] clang --- torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp b/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp index 2926259a70a..e9f0dc9546d 100644 --- a/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp +++ b/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp @@ -283,4 +283,4 @@ TORCH_LIBRARY_IMPL(torchvision, QuantizedCPU, m) { } } // namespace ops -} // namespace vision \ No newline at end of file +} // namespace vision From 68b0dd8f11dcc87d8f034e645a528893db8762ff Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 2 Apr 2021 22:34:58 +0100 Subject: [PATCH 03/26] docs --- torchvision/ops/roi_align.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/torchvision/ops/roi_align.py b/torchvision/ops/roi_align.py index 0f6c0be1729..c062c6f91f3 100644 --- a/torchvision/ops/roi_align.py +++ b/torchvision/ops/roi_align.py @@ -17,7 +17,7 @@ def roi_align( aligned: bool = False, ) -> Tensor: """ - Performs Region of Interest (RoI) Align operator described in Mask R-CNN + Performs Region of Interest (RoI) Align operator with average pooling, as described in Mask R-CNN. Args: input (Tensor[N, C, H, W]): input tensor @@ -27,19 +27,19 @@ def roi_align( If a single Tensor is passed, then the first column should contain the batch index. If a list of Tensors is passed, then each Tensor will correspond to the boxes for an element i - in a batch - output_size (int or Tuple[int, int]): the size of the output after the cropping - is performed, as (height, width) + in a batch. + output_size (int or Tuple[int, int]): the size of the output after the pooling + is performed, as (height, width). spatial_scale (float): a scaling factor that maps the input coordinates to the box coordinates. Default: 1.0 sampling_ratio (int): number of sampling points in the interpolation grid used to compute the output value of each pooled output bin. If > 0, - then exactly sampling_ratio x sampling_ratio grid points are used. 
If + then exactly ``sampling_ratio x sampling_ratio`` sampling points per bin are used. If <= 0, then an adaptive number of grid points are used (computed as - ceil(roi_width / pooled_w), and likewise for height). Default: -1 + ``ceil(roi_width / output_width)``, and likewise for height). Default: -1 aligned (bool): If False, use the legacy implementation. If True, pixel shift it by -0.5 for align more perfectly about two neighboring pixel indices. - This version in Detectron2 + This version is used in Detectron2 Returns: output (Tensor[K, C, output_size[0], output_size[1]]) From c115b7385bc4bec600c149fa09a5015dbf2d0201 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 2 Apr 2021 22:35:14 +0100 Subject: [PATCH 04/26] extracted out common utils --- torchvision/csrc/ops/cpu/roi_align_kernel.cpp | 123 +-------------- .../ops/quantized/cpu/qroi_align_kernel.cpp | 143 ++---------------- torchvision/csrc/ops/roi_align.h | 107 +++++++++++++ 3 files changed, 127 insertions(+), 246 deletions(-) diff --git a/torchvision/csrc/ops/cpu/roi_align_kernel.cpp b/torchvision/csrc/ops/cpu/roi_align_kernel.cpp index dc0c38cd314..a70e5d5d630 100644 --- a/torchvision/csrc/ops/cpu/roi_align_kernel.cpp +++ b/torchvision/csrc/ops/cpu/roi_align_kernel.cpp @@ -1,120 +1,13 @@ #include #include +#include "../roi_align.h" + namespace vision { namespace ops { namespace { -// implementation taken from Caffe2 -template -struct PreCalc { - int pos1; - int pos2; - int pos3; - int pos4; - T w1; - T w2; - T w3; - T w4; -}; - -template -void pre_calc_for_bilinear_interpolate( - int height, - int width, - int pooled_height, - int pooled_width, - int iy_upper, - int ix_upper, - T roi_start_h, - T roi_start_w, - T bin_size_h, - T bin_size_w, - int roi_bin_grid_h, - int roi_bin_grid_w, - std::vector>& pre_calc) { - int pre_calc_index = 0; - for (int ph = 0; ph < pooled_height; ph++) { - for (int pw = 0; pw < pooled_width; pw++) { - for (int iy = 0; iy < iy_upper; iy++) { - const T yy = roi_start_h + ph 
* bin_size_h + - static_cast(iy + .5f) * bin_size_h / - static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 - for (int ix = 0; ix < ix_upper; ix++) { - const T xx = roi_start_w + pw * bin_size_w + - static_cast(ix + .5f) * bin_size_w / - static_cast(roi_bin_grid_w); - - T x = xx; - T y = yy; - // deal with: inverse elements are out of feature map boundary - if (y < -1.0 || y > height || x < -1.0 || x > width) { - // empty - PreCalc pc; - pc.pos1 = 0; - pc.pos2 = 0; - pc.pos3 = 0; - pc.pos4 = 0; - pc.w1 = 0; - pc.w2 = 0; - pc.w3 = 0; - pc.w4 = 0; - pre_calc[pre_calc_index] = pc; - pre_calc_index += 1; - continue; - } - - if (y <= 0) { - y = 0; - } - if (x <= 0) { - x = 0; - } - - int y_low = (int)y; - int x_low = (int)x; - int y_high; - int x_high; - - if (y_low >= height - 1) { - y_high = y_low = height - 1; - y = (T)y_low; - } else { - y_high = y_low + 1; - } - - if (x_low >= width - 1) { - x_high = x_low = width - 1; - x = (T)x_low; - } else { - x_high = x_low + 1; - } - - T ly = y - y_low; - T lx = x - x_low; - T hy = 1. - ly, hx = 1. - lx; - T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; - - // save weights and indeces - PreCalc pc; - pc.pos1 = y_low * width + x_low; - pc.pos2 = y_low * width + x_high; - pc.pos3 = y_high * width + x_low; - pc.pos4 = y_high * width + x_high; - pc.w1 = w1; - pc.w2 = w2; - pc.w3 = w3; - pc.w4 = w4; - pre_calc[pre_calc_index] = pc; - - pre_calc_index += 1; - } - } - } - } -} - template void roi_align_forward_kernel_impl( int n_rois, @@ -167,17 +60,15 @@ void roi_align_forward_kernel_impl( // When the grid is empty, output zeros. const T count = std::max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. 
= 4 - // we want to precalculate indeces and weights shared by all chanels, - // this is the key point of optimiation - std::vector> pre_calc( + // we want to precalculate indices and weights shared by all chanels, + // this is the key point of optimization + std::vector> pre_calc( roi_bin_grid_h * roi_bin_grid_w * pooled_width * pooled_height); - pre_calc_for_bilinear_interpolate( + detail::pre_calc_for_bilinear_interpolate( height, width, pooled_height, pooled_width, - roi_bin_grid_h, - roi_bin_grid_w, roi_start_h, roi_start_w, bin_size_h, @@ -199,7 +90,7 @@ void roi_align_forward_kernel_impl( T output_val = 0.; for (int iy = 0; iy < roi_bin_grid_h; iy++) { for (int ix = 0; ix < roi_bin_grid_w; ix++) { - PreCalc pc = pre_calc[pre_calc_index]; + detail::PreCalc pc = pre_calc[pre_calc_index]; output_val += pc.w1 * offset_input[pc.pos1] + pc.w2 * offset_input[pc.pos2] + pc.w3 * offset_input[pc.pos3] + pc.w4 * offset_input[pc.pos4]; diff --git a/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp b/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp index e9f0dc9546d..55086ae4ec4 100644 --- a/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp +++ b/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp @@ -4,125 +4,18 @@ #include #include +#include "../../roi_align.h" + namespace vision { namespace ops { namespace { -template -struct PreCalc { - int pos1; - int pos2; - int pos3; - int pos4; - T w1; - T w2; - T w3; - T w4; -}; - -template -void pre_calc_for_bilinear_interpolate( - int height, - int width, - int pooled_height, - int pooled_width, - int iy_upper, - int ix_upper, - T roi_start_h, - T roi_start_w, - T bin_size_h, - T bin_size_w, - int roi_bin_grid_h, - int roi_bin_grid_w, - std::vector>& pre_calc) { - int pre_calc_index = 0; - for (int ph = 0; ph < pooled_height; ph++) { - for (int pw = 0; pw < pooled_width; pw++) { - for (int iy = 0; iy < iy_upper; iy++) { - const T yy = roi_start_h + ph * bin_size_h + - static_cast(iy + .5f) * 
bin_size_h / - static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 - for (int ix = 0; ix < ix_upper; ix++) { - const T xx = roi_start_w + pw * bin_size_w + - static_cast(ix + .5f) * bin_size_w / - static_cast(roi_bin_grid_w); - - T x = xx; - T y = yy; - // deal with: inverse elements are out of feature map boundary - if (y < -1.0 || y > height || x < -1.0 || x > width) { - // empty - PreCalc pc; - pc.pos1 = 0; - pc.pos2 = 0; - pc.pos3 = 0; - pc.pos4 = 0; - pc.w1 = 0; - pc.w2 = 0; - pc.w3 = 0; - pc.w4 = 0; - pre_calc[pre_calc_index] = pc; - pre_calc_index += 1; - continue; - } - - if (y <= 0) { - y = 0; - } - if (x <= 0) { - x = 0; - } - - int y_low = (int)y; - int x_low = (int)x; - int y_high; - int x_high; - - if (y_low >= height - 1) { - y_high = y_low = height - 1; - y = (T)y_low; - } else { - y_high = y_low + 1; - } - - if (x_low >= width - 1) { - x_high = x_low = width - 1; - x = (T)x_low; - } else { - x_high = x_low + 1; - } - - T ly = y - y_low; - T lx = x - x_low; - T hy = 1. - ly, hx = 1. 
- lx; - T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; - - // save weights and indeces - PreCalc pc; - pc.pos1 = y_low * width + x_low; - pc.pos2 = y_low * width + x_high; - pc.pos3 = y_high * width + x_low; - pc.pos4 = y_high * width + x_high; - pc.w1 = w1; - pc.w2 = w2; - pc.w3 = w3; - pc.w4 = w4; - pre_calc[pre_calc_index] = pc; - - pre_calc_index += 1; - } - } - } - } -} - - template void qroi_align_forward_kernel_impl( int n_rois, const T* input, - double & spatial_scale, + const float & spatial_scale, int channels, int height, int width, @@ -131,22 +24,16 @@ void qroi_align_forward_kernel_impl( int sampling_ratio, bool aligned, const T* rois, - T* output, - int64_t output_size) { - - for (int64_t i = 0; i < output_size; i++) { - output[i].val_ = 0; - } + T* output){ for (int n = 0; n < n_rois; n++) { int index_n = n * channels * pooled_width * pooled_height; const T* offset_rois = rois + n * 5; - int roi_batch_ind = offset_rois[0].val_; + int roi_batch_ind = offset_rois[0].val_; // FIXME: This can be out of the range of the quantized type!! // Do not using rounding; this implementation detail is critical - // T offset = aligned ? (T)0.5 : (T)0.0; - int offset = 0; // TODO fix this + float offset = aligned ? 0.5 : 0.; float roi_start_w = offset_rois[1].val_ * spatial_scale - offset; float roi_start_h = offset_rois[2].val_ * spatial_scale - offset; float roi_end_w = offset_rois[3].val_ * spatial_scale - offset; @@ -172,19 +59,17 @@ void qroi_align_forward_kernel_impl( // We do average (integral) pooling inside a bin // When the grid is empty, output zeros. - const int count = std::max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. = 4 + const float count = std::max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. 
= 4 - // we want to precalculate indeces and weights shared by all chanels, - // this is the key point of optimiation - std::vector> pre_calc( + // we want to precalculate indices and weights shared by all chanels, + // this is the key point of optimization + std::vector> pre_calc( roi_bin_grid_h * roi_bin_grid_w * pooled_width * pooled_height); - pre_calc_for_bilinear_interpolate( + detail::pre_calc_for_bilinear_interpolate( height, width, pooled_height, pooled_width, - roi_bin_grid_h, - roi_bin_grid_w, roi_start_h, roi_start_w, bin_size_h, @@ -206,7 +91,7 @@ void qroi_align_forward_kernel_impl( float output_val = 0.; for (int iy = 0; iy < roi_bin_grid_h; iy++) { for (int ix = 0; ix < roi_bin_grid_w; ix++) { - PreCalc pc = pre_calc[pre_calc_index]; + detail::PreCalc pc = pre_calc[pre_calc_index]; output_val += pc.w1 * offset_input[pc.pos1].val_ + pc.w2 * offset_input[pc.pos2].val_ + pc.w3 * offset_input[pc.pos3].val_ + pc.w4 * offset_input[pc.pos4].val_; @@ -246,7 +131,6 @@ at::Tensor qroi_align_forward_kernel( auto height = input.size(2); auto width = input.size(3); - // TODO: This should really be initialized to zero, not empty at::Tensor output = at::_empty_affine_quantized( {num_rois, channels, pooled_height, pooled_width}, input.options()); @@ -268,8 +152,7 @@ at::Tensor qroi_align_forward_kernel( sampling_ratio, aligned, rois_.data_ptr(), - output.data_ptr(), - output.numel()); + output.data_ptr()); }); return output; } diff --git a/torchvision/csrc/ops/roi_align.h b/torchvision/csrc/ops/roi_align.h index 2ddb6ac3945..ff3b3596120 100644 --- a/torchvision/csrc/ops/roi_align.h +++ b/torchvision/csrc/ops/roi_align.h @@ -30,6 +30,113 @@ at::Tensor _roi_align_backward( int64_t sampling_ratio, bool aligned); +template +struct PreCalc { + int pos1; + int pos2; + int pos3; + int pos4; + T w1; + T w2; + T w3; + T w4; +}; + +// implementation taken from Caffe2 +template +void pre_calc_for_bilinear_interpolate( + int height, + int width, + int pooled_height, + int 
pooled_width, + T roi_start_h, + T roi_start_w, + T bin_size_h, + T bin_size_w, + int roi_bin_grid_h, + int roi_bin_grid_w, + std::vector>& pre_calc) { + int pre_calc_index = 0; + for (int ph = 0; ph < pooled_height; ph++) { + for (int pw = 0; pw < pooled_width; pw++) { + for (int iy = 0; iy < roi_bin_grid_h; iy++) { + const T yy = roi_start_h + ph * bin_size_h + + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 + for (int ix = 0; ix < roi_bin_grid_w; ix++) { + const T xx = roi_start_w + pw * bin_size_w + + static_cast(ix + .5f) * bin_size_w / + static_cast(roi_bin_grid_w); + + T x = xx; + T y = yy; + // deal with: inverse elements are out of feature map boundary + if (y < -1.0 || y > height || x < -1.0 || x > width) { + // empty + PreCalc pc; + pc.pos1 = 0; + pc.pos2 = 0; + pc.pos3 = 0; + pc.pos4 = 0; + pc.w1 = 0; + pc.w2 = 0; + pc.w3 = 0; + pc.w4 = 0; + pre_calc[pre_calc_index] = pc; + pre_calc_index += 1; + continue; + } + + if (y <= 0) { + y = 0; + } + if (x <= 0) { + x = 0; + } + + int y_low = (int)y; + int x_low = (int)x; + int y_high; + int x_high; + + if (y_low >= height - 1) { + y_high = y_low = height - 1; + y = (T)y_low; + } else { + y_high = y_low + 1; + } + + if (x_low >= width - 1) { + x_high = x_low = width - 1; + x = (T)x_low; + } else { + x_high = x_low + 1; + } + + T ly = y - y_low; + T lx = x - x_low; + T hy = 1. - ly, hx = 1. 
- lx; + T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; + + // save weights and indeces + PreCalc pc; + pc.pos1 = y_low * width + x_low; + pc.pos2 = y_low * width + x_high; + pc.pos3 = y_high * width + x_low; + pc.pos4 = y_high * width + x_high; + pc.w1 = w1; + pc.w2 = w2; + pc.w3 = w3; + pc.w4 = w4; + pre_calc[pre_calc_index] = pc; + + pre_calc_index += 1; + } + } + } + } +} + } // namespace detail } // namespace ops From aadd2fc210992fbf1e4f22b8ebf54de96c0ae897 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sat, 3 Apr 2021 13:21:32 +0100 Subject: [PATCH 05/26] Use better quantization function and pass tensors as parameters --- .../ops/quantized/cpu/qroi_align_kernel.cpp | 23 +++++++++++-------- torchvision/csrc/ops/roi_align.h | 10 ++++++++ 2 files changed, 23 insertions(+), 10 deletions(-) diff --git a/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp b/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp index 55086ae4ec4..002ba408729 100644 --- a/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp +++ b/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp @@ -1,8 +1,6 @@ #include #include #include -#include -#include #include "../../roi_align.h" @@ -14,7 +12,7 @@ namespace { template void qroi_align_forward_kernel_impl( int n_rois, - const T* input, + const at::Tensor& t_input, const float & spatial_scale, int channels, int height, @@ -23,9 +21,12 @@ void qroi_align_forward_kernel_impl( int pooled_width, int sampling_ratio, bool aligned, - const T* rois, + const at::Tensor& t_rois, T* output){ - + + const T* input = t_input.contiguous().data_ptr(); + const T* rois = t_rois.contiguous().data_ptr(); + for (int n = 0; n < n_rois; n++) { int index_n = n * channels * pooled_width * pooled_height; @@ -101,7 +102,7 @@ void qroi_align_forward_kernel_impl( } output_val /= count; - output[index] = at::native::requantize_from_int(1.f, 0, (int)output_val); // TODO: this is wrong need to set scale and zero etc. 
+ output[index] = at::native::quantize_val(1.f, 0, output_val); // TODO: this is wrong need to set scale and zero etc. } // for pw } // for ph } // for c @@ -131,18 +132,20 @@ at::Tensor qroi_align_forward_kernel( auto height = input.size(2); auto width = input.size(3); + // FIXME: This is private, API might change: + // https://github.com/pytorch/pytorch/wiki/Introducing-Quantized-Tensor#quantized-tensor-apis at::Tensor output = at::_empty_affine_quantized( - {num_rois, channels, pooled_height, pooled_width}, input.options()); + {num_rois, channels, pooled_height, pooled_width}, input.options(), + input.q_scale(), input.q_zero_point()); if (output.numel() == 0) return output; - auto input_ = input.contiguous(), rois_ = rois.contiguous(); AT_DISPATCH_QINT_TYPES( input.scalar_type(), "qroi_align_forward_kernel", [&] { qroi_align_forward_kernel_impl( num_rois, - input_.data_ptr(), + input, spatial_scale, channels, height, @@ -151,7 +154,7 @@ at::Tensor qroi_align_forward_kernel( pooled_width, sampling_ratio, aligned, - rois_.data_ptr(), + rois, output.data_ptr()); }); return output; diff --git a/torchvision/csrc/ops/roi_align.h b/torchvision/csrc/ops/roi_align.h index ff3b3596120..20c5684c87c 100644 --- a/torchvision/csrc/ops/roi_align.h +++ b/torchvision/csrc/ops/roi_align.h @@ -42,6 +42,16 @@ struct PreCalc { T w4; }; +// This helper computes the interpolation weights (w1, w2...) for every sampling +// point of a given box. There are pool_height * pool_width * roi_bin_grid_h * +// roi_bin_grid_w such sampling points. +// +// The weights (w1, w2...) are computed as the areas in this figure: +// https://en.wikipedia.org/wiki/Bilinear_interpolation#/media/File:Bilinear_interpolation_visualisation.svg +// and pos1, pos2 etc correspond to the indices of their respective pixels. +// +// Note: the weights and indices are shared across all channels, which is why +// they are pre-calculated prior to the main loop in the RoIAlign kernel. 
// implementation taken from Caffe2 template void pre_calc_for_bilinear_interpolate( From 81a320778b9606386a71e9f7d1179b43ce9a0f1b Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sat, 3 Apr 2021 14:11:48 +0100 Subject: [PATCH 06/26] proper dequantization --- .../ops/quantized/cpu/qroi_align_kernel.cpp | 27 ++++++++++++------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp b/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp index 002ba408729..a2e71129f9b 100644 --- a/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp +++ b/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp @@ -25,20 +25,25 @@ void qroi_align_forward_kernel_impl( T* output){ const T* input = t_input.contiguous().data_ptr(); + int64_t input_zp = t_input.q_zero_point(); + float input_scale = t_input.q_scale(); + const T* rois = t_rois.contiguous().data_ptr(); + int64_t rois_zp = t_rois.q_zero_point(); + float rois_scale = t_rois.q_scale(); for (int n = 0; n < n_rois; n++) { int index_n = n * channels * pooled_width * pooled_height; const T* offset_rois = rois + n * 5; - int roi_batch_ind = offset_rois[0].val_; // FIXME: This can be out of the range of the quantized type!! + int roi_batch_ind = at::native::dequantize_val(rois_scale, rois_zp, offset_rois[0]); // FIXME: This can be out of the range of the quantized type!! // Do not using rounding; this implementation detail is critical float offset = aligned ? 
0.5 : 0.; - float roi_start_w = offset_rois[1].val_ * spatial_scale - offset; - float roi_start_h = offset_rois[2].val_ * spatial_scale - offset; - float roi_end_w = offset_rois[3].val_ * spatial_scale - offset; - float roi_end_h = offset_rois[4].val_ * spatial_scale - offset; + float roi_start_w = at::native::dequantize_val(rois_scale, rois_zp, offset_rois[1]) * spatial_scale - offset; + float roi_start_h = at::native::dequantize_val(rois_scale, rois_zp, offset_rois[2]) * spatial_scale - offset; + float roi_end_w = at::native::dequantize_val(rois_scale, rois_zp, offset_rois[3]) * spatial_scale - offset; + float roi_end_h = at::native::dequantize_val(rois_scale, rois_zp, offset_rois[4]) * spatial_scale - offset; float roi_width = roi_end_w - roi_start_w; float roi_height = roi_end_h - roi_start_h; @@ -93,21 +98,23 @@ void qroi_align_forward_kernel_impl( for (int iy = 0; iy < roi_bin_grid_h; iy++) { for (int ix = 0; ix < roi_bin_grid_w; ix++) { detail::PreCalc pc = pre_calc[pre_calc_index]; - output_val += pc.w1 * offset_input[pc.pos1].val_ + - pc.w2 * offset_input[pc.pos2].val_ + - pc.w3 * offset_input[pc.pos3].val_ + pc.w4 * offset_input[pc.pos4].val_; + + output_val += // TODO: We can probably optimize the dequantization + pc.w1 * at::native::dequantize_val(input_scale, input_zp, offset_input[pc.pos1]) + + pc.w2 * at::native::dequantize_val(input_scale, input_zp, offset_input[pc.pos2]) + + pc.w3 * at::native::dequantize_val(input_scale, input_zp, offset_input[pc.pos3]) + + pc.w4 * at::native::dequantize_val(input_scale, input_zp, offset_input[pc.pos4]); pre_calc_index += 1; } } output_val /= count; - output[index] = at::native::quantize_val(1.f, 0, output_val); // TODO: this is wrong need to set scale and zero etc. 
+ output[index] = at::native::quantize_val(input_scale, input_zp, output_val); } // for pw } // for ph } // for c } // for n - } at::Tensor qroi_align_forward_kernel( From 295a6ccf9c87c2062cfa35c9f7e08ef9c8e9dfc6 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sat, 3 Apr 2021 17:19:38 +0100 Subject: [PATCH 07/26] Some tests --- test/test_ops.py | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/test/test_ops.py b/test/test_ops.py index 0031da45cce..fab324b2f3d 100644 --- a/test/test_ops.py +++ b/test/test_ops.py @@ -299,6 +299,36 @@ def _test_forward(self, device, contiguous, x_dtype=None, rois_dtype=None, **kwa for aligned in (True, False): super()._test_forward(device, contiguous, x_dtype, rois_dtype, aligned=aligned) + def test_qroialign(self): + """Make sure quantized version of RoIAlign is close to float version""" + pool_size = 5 + img_size = 10 + n_channels = 2 + num_batches = 2 + dtype = torch.float + + def make_rois(num_rois=1000): + rois = torch.randint(0, img_size // 2, size=(num_rois, 5)).to(dtype) + rois[:, 0] = torch.randint(0, num_batches, size=(num_rois,)) # set batch index + rois[:, 3:] += rois[:, 1:3] # make sure boxes aren't degenerate + return rois + + for scale, zero_point in ((1, 0), (2, 10)): + for qdtype in (torch.qint8, torch.quint8, torch.qint32): + + x = torch.randint(0, 100, size=(num_batches, n_channels, img_size, img_size)).to(dtype) + qx = torch.quantize_per_tensor(x, scale=scale, zero_point=zero_point, dtype=qdtype) + + rois = make_rois() + qrois = torch.quantize_per_tensor(rois, scale=scale, zero_point=zero_point, dtype=qdtype) + + x, rois = qx.dequantize(), qrois.dequantize() + + y = ops.roi_align(x, rois, output_size=pool_size, spatial_scale=1, sampling_ratio=-1) + qy = ops.roi_align(qx, qrois, output_size=pool_size, spatial_scale=1, sampling_ratio=-1) + + self.assertTrue(torch.allclose(y, qy.dequantize(), atol=1)) + class PSRoIAlignTester(RoIOpTester, unittest.TestCase): def fn(self, x, rois, 
pool_h, pool_w, spatial_scale=1, sampling_ratio=-1, **kwargs): From 626f7900435c21559558562d332754533c2330ef Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sat, 3 Apr 2021 17:39:36 +0100 Subject: [PATCH 08/26] Dequantization optimization, seems to gain a few ms --- torchvision/csrc/ops/cpu/roi_align_kernel.cpp | 2 +- .../ops/quantized/cpu/qroi_align_kernel.cpp | 17 +++++++++++------ 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/torchvision/csrc/ops/cpu/roi_align_kernel.cpp b/torchvision/csrc/ops/cpu/roi_align_kernel.cpp index a70e5d5d630..619064cb40e 100644 --- a/torchvision/csrc/ops/cpu/roi_align_kernel.cpp +++ b/torchvision/csrc/ops/cpu/roi_align_kernel.cpp @@ -98,7 +98,7 @@ void roi_align_forward_kernel_impl( pre_calc_index += 1; } } - output_val /= count; + output_val /= count; // Average pooling output[index] = output_val; } // for pw diff --git a/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp b/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp index a2e71129f9b..0c6b8e7f925 100644 --- a/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp +++ b/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp @@ -95,20 +95,25 @@ void qroi_align_forward_kernel_impl( int index = index_n_c + ph * pooled_width + pw; float output_val = 0.; + float sum_w = 0.; for (int iy = 0; iy < roi_bin_grid_h; iy++) { for (int ix = 0; ix < roi_bin_grid_w; ix++) { detail::PreCalc pc = pre_calc[pre_calc_index]; - output_val += // TODO: We can probably optimize the dequantization - pc.w1 * at::native::dequantize_val(input_scale, input_zp, offset_input[pc.pos1]) + - pc.w2 * at::native::dequantize_val(input_scale, input_zp, offset_input[pc.pos2]) + - pc.w3 * at::native::dequantize_val(input_scale, input_zp, offset_input[pc.pos3]) + - pc.w4 * at::native::dequantize_val(input_scale, input_zp, offset_input[pc.pos4]); + // to optimize computations we use the raw .val_ fields and we'll dequantize later + output_val += + pc.w1 * offset_input[pc.pos1].val_ + 
+ pc.w2 * offset_input[pc.pos2].val_ + + pc.w3 * offset_input[pc.pos3].val_ + + pc.w4 * offset_input[pc.pos4].val_; + + sum_w += pc.w1 + pc.w2 + pc.w3 + pc.w4; pre_calc_index += 1; } } - output_val /= count; + output_val = input_scale * (output_val - input_zp * sum_w); // dequantization + output_val /= count; // Average pooling output[index] = at::native::quantize_val(input_scale, input_zp, output_val); } // for pw From b1b68f13983e589aa072dbddfda863ce06fabac7 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sat, 3 Apr 2021 17:44:50 +0100 Subject: [PATCH 09/26] clang-format --- .../ops/quantized/cpu/qroi_align_kernel.cpp | 85 +++++++++++-------- 1 file changed, 51 insertions(+), 34 deletions(-) diff --git a/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp b/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp index 0c6b8e7f925..81c14f625d2 100644 --- a/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp +++ b/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp @@ -1,6 +1,6 @@ #include -#include #include +#include #include "../../roi_align.h" @@ -13,7 +13,7 @@ template void qroi_align_forward_kernel_impl( int n_rois, const at::Tensor& t_input, - const float & spatial_scale, + const float& spatial_scale, int channels, int height, int width, @@ -22,8 +22,7 @@ void qroi_align_forward_kernel_impl( int sampling_ratio, bool aligned, const at::Tensor& t_rois, - T* output){ - + T* output) { const T* input = t_input.contiguous().data_ptr(); int64_t input_zp = t_input.q_zero_point(); float input_scale = t_input.q_scale(); @@ -31,19 +30,33 @@ void qroi_align_forward_kernel_impl( const T* rois = t_rois.contiguous().data_ptr(); int64_t rois_zp = t_rois.q_zero_point(); float rois_scale = t_rois.q_scale(); - + for (int n = 0; n < n_rois; n++) { int index_n = n * channels * pooled_width * pooled_height; const T* offset_rois = rois + n * 5; - int roi_batch_ind = at::native::dequantize_val(rois_scale, rois_zp, offset_rois[0]); // FIXME: This can be out of the 
range of the quantized type!! + int roi_batch_ind = at::native::dequantize_val( + rois_scale, rois_zp, offset_rois[0]); // FIXME: This can be out of the + // range of the quantized type!! // Do not using rounding; this implementation detail is critical float offset = aligned ? 0.5 : 0.; - float roi_start_w = at::native::dequantize_val(rois_scale, rois_zp, offset_rois[1]) * spatial_scale - offset; - float roi_start_h = at::native::dequantize_val(rois_scale, rois_zp, offset_rois[2]) * spatial_scale - offset; - float roi_end_w = at::native::dequantize_val(rois_scale, rois_zp, offset_rois[3]) * spatial_scale - offset; - float roi_end_h = at::native::dequantize_val(rois_scale, rois_zp, offset_rois[4]) * spatial_scale - offset; + float roi_start_w = + at::native::dequantize_val(rois_scale, rois_zp, offset_rois[1]) * + spatial_scale - + offset; + float roi_start_h = + at::native::dequantize_val(rois_scale, rois_zp, offset_rois[2]) * + spatial_scale - + offset; + float roi_end_w = + at::native::dequantize_val(rois_scale, rois_zp, offset_rois[3]) * + spatial_scale - + offset; + float roi_end_h = + at::native::dequantize_val(rois_scale, rois_zp, offset_rois[4]) * + spatial_scale - + offset; float roi_width = roi_end_w - roi_start_w; float roi_height = roi_end_h - roi_start_h; @@ -65,7 +78,8 @@ void qroi_align_forward_kernel_impl( // We do average (integral) pooling inside a bin // When the grid is empty, output zeros. - const float count = std::max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. = 4 + const float count = + std::max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. 
= 4 // we want to precalculate indices and weights shared by all chanels, // this is the key point of optimization @@ -100,9 +114,9 @@ void qroi_align_forward_kernel_impl( for (int ix = 0; ix < roi_bin_grid_w; ix++) { detail::PreCalc pc = pre_calc[pre_calc_index]; - // to optimize computations we use the raw .val_ fields and we'll dequantize later - output_val += - pc.w1 * offset_input[pc.pos1].val_ + + // to optimize computations we use the raw .val_ fields and we'll + // dequantize later + output_val += pc.w1 * offset_input[pc.pos1].val_ + pc.w2 * offset_input[pc.pos2].val_ + pc.w3 * offset_input[pc.pos3].val_ + pc.w4 * offset_input[pc.pos4].val_; @@ -112,10 +126,12 @@ void qroi_align_forward_kernel_impl( pre_calc_index += 1; } } - output_val = input_scale * (output_val - input_zp * sum_w); // dequantization + output_val = + input_scale * (output_val - input_zp * sum_w); // dequantization output_val /= count; // Average pooling - output[index] = at::native::quantize_val(input_scale, input_zp, output_val); + output[index] = + at::native::quantize_val(input_scale, input_zp, output_val); } // for pw } // for ph } // for c @@ -147,28 +163,29 @@ at::Tensor qroi_align_forward_kernel( // FIXME: This is private, API might change: // https://github.com/pytorch/pytorch/wiki/Introducing-Quantized-Tensor#quantized-tensor-apis at::Tensor output = at::_empty_affine_quantized( - {num_rois, channels, pooled_height, pooled_width}, input.options(), - input.q_scale(), input.q_zero_point()); + {num_rois, channels, pooled_height, pooled_width}, + input.options(), + input.q_scale(), + input.q_zero_point()); if (output.numel() == 0) return output; - AT_DISPATCH_QINT_TYPES( - input.scalar_type(), "qroi_align_forward_kernel", [&] { - qroi_align_forward_kernel_impl( - num_rois, - input, - spatial_scale, - channels, - height, - width, - pooled_height, - pooled_width, - sampling_ratio, - aligned, - rois, - output.data_ptr()); - }); + AT_DISPATCH_QINT_TYPES(input.scalar_type(), 
"qroi_align_forward_kernel", [&] { + qroi_align_forward_kernel_impl( + num_rois, + input, + spatial_scale, + channels, + height, + width, + pooled_height, + pooled_width, + sampling_ratio, + aligned, + rois, + output.data_ptr()); + }); return output; } From fb45472c9e0de9e97991b406a248257ca9d38cbd Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sat, 3 Apr 2021 17:49:58 +0100 Subject: [PATCH 10/26] again --- torchvision/csrc/ops/roi_align.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchvision/csrc/ops/roi_align.h b/torchvision/csrc/ops/roi_align.h index 20c5684c87c..61775e4559d 100644 --- a/torchvision/csrc/ops/roi_align.h +++ b/torchvision/csrc/ops/roi_align.h @@ -45,7 +45,7 @@ struct PreCalc { // This helper computes the interpolation weights (w1, w2...) for every sampling // point of a given box. There are pool_height * pool_width * roi_bin_grid_h * // roi_bin_grid_w such sampling points. -// +// // The weights (w1, w2...) are computed as the areas in this figure: // https://en.wikipedia.org/wiki/Bilinear_interpolation#/media/File:Bilinear_interpolation_visualisation.svg // and pos1, pos2 etc correspond to the indices of their respective pixels. From 79bdfdf010a48067a89a49fa0c5521b1fbd2aadb Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sun, 4 Apr 2021 10:25:56 +0100 Subject: [PATCH 11/26] more correct test. 
Had to remove optimization although it almost works --- test/test_ops.py | 40 +++++++++++++++---- .../ops/quantized/cpu/qroi_align_kernel.cpp | 36 ++++++++++++----- 2 files changed, 57 insertions(+), 19 deletions(-) diff --git a/test/test_ops.py b/test/test_ops.py index fab324b2f3d..8d3c322083e 100644 --- a/test/test_ops.py +++ b/test/test_ops.py @@ -313,21 +313,45 @@ def make_rois(num_rois=1000): rois[:, 3:] += rois[:, 1:3] # make sure boxes aren't degenerate return rois - for scale, zero_point in ((1, 0), (2, 10)): + for scale, zero_point in ((1, 0), (2, 10), (0.1, 50)): for qdtype in (torch.qint8, torch.quint8, torch.qint32): - x = torch.randint(0, 100, size=(num_batches, n_channels, img_size, img_size)).to(dtype) + x = torch.randint(50, 100, size=(num_batches, n_channels, img_size, img_size)).to(dtype) qx = torch.quantize_per_tensor(x, scale=scale, zero_point=zero_point, dtype=qdtype) rois = make_rois() qrois = torch.quantize_per_tensor(rois, scale=scale, zero_point=zero_point, dtype=qdtype) - x, rois = qx.dequantize(), qrois.dequantize() - - y = ops.roi_align(x, rois, output_size=pool_size, spatial_scale=1, sampling_ratio=-1) - qy = ops.roi_align(qx, qrois, output_size=pool_size, spatial_scale=1, sampling_ratio=-1) - - self.assertTrue(torch.allclose(y, qy.dequantize(), atol=1)) + x, rois = qx.dequantize(), qrois.dequantize() # we want to pass the same inputs + + y = ops.roi_align( + x, + rois, + output_size=pool_size, + spatial_scale=1, + sampling_ratio=-1, + # aligned=aligned, + ) + qy = ops.roi_align( + qx, + qrois, + output_size=pool_size, + spatial_scale=1, + sampling_ratio=-1, + # aligned=aligned, + ) + + # The output qy is itself a quantized tensor and there might have been a loss of info when it was + # quantized. 
For a fair comparison we need to quantize y as well + quantized_float_y = torch.quantize_per_tensor(y, scale=scale, zero_point=zero_point, dtype=qdtype) + n_diff = (quantized_float_y != qy).sum() + diff = torch.abs((quantized_float_y.dequantize() - qy.dequantize())).sum() + self.assertTrue((qy == quantized_float_y).all(), f"{scale}, {zero_point}, {qdtype}, {n_diff}, {diff},") + + if (scale, zero_point) == (1, 0): + # in this case we can assert strict equality as the requantization of the output was the + # identity + self.assertTrue((qy.dequantize() == y.round()).all()) class PSRoIAlignTester(RoIOpTester, unittest.TestCase): diff --git a/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp b/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp index 81c14f625d2..10c62cf78b3 100644 --- a/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp +++ b/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp @@ -109,25 +109,39 @@ void qroi_align_forward_kernel_impl( int index = index_n_c + ph * pooled_width + pw; float output_val = 0.; - float sum_w = 0.; + // float sum_w = 0.; for (int iy = 0; iy < roi_bin_grid_h; iy++) { for (int ix = 0; ix < roi_bin_grid_w; ix++) { detail::PreCalc pc = pre_calc[pre_calc_index]; - // to optimize computations we use the raw .val_ fields and we'll - // dequantize later - output_val += pc.w1 * offset_input[pc.pos1].val_ + - pc.w2 * offset_input[pc.pos2].val_ + - pc.w3 * offset_input[pc.pos3].val_ + - pc.w4 * offset_input[pc.pos4].val_; - - sum_w += pc.w1 + pc.w2 + pc.w3 + pc.w4; + output_val += pc.w1 * + at::native::dequantize_val( + input_scale, input_zp, offset_input[pc.pos1]) + + pc.w2 * + at::native::dequantize_val( + input_scale, input_zp, offset_input[pc.pos2]) + + pc.w3 * + at::native::dequantize_val( + input_scale, input_zp, offset_input[pc.pos3]) + + pc.w4 * + at::native::dequantize_val( + input_scale, input_zp, offset_input[pc.pos4]); + + // FIXME: Possible optimization. 
Unfortunately the tests fail + // on some (few) inputs: Python rounds up while the C++ code + // rounds down (or the other way around). + // output_val += pc.w1 * offset_input[pc.pos1].val_ + + // pc.w2 * offset_input[pc.pos2].val_ + + // pc.w3 * offset_input[pc.pos3].val_ + + // pc.w4 * offset_input[pc.pos4].val_; + // sum_w += pc.w1 + pc.w2 + pc.w3 + pc.w4; + // And then dequantize later, just before averaging: + // output_val = input_scale * (output_val - (float)input_zp * + // sum_w); pre_calc_index += 1; } } - output_val = - input_scale * (output_val - input_zp * sum_w); // dequantization output_val /= count; // Average pooling output[index] = From 3dccacaaa85f07d39d4c267439ae1f56f11e4be7 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sun, 4 Apr 2021 11:31:06 +0100 Subject: [PATCH 12/26] Also test aligned=True --- test/test_ops.py | 82 +++++++++++++++++++++++++----------------------- 1 file changed, 43 insertions(+), 39 deletions(-) diff --git a/test/test_ops.py b/test/test_ops.py index 8d3c322083e..6b3b085a81e 100644 --- a/test/test_ops.py +++ b/test/test_ops.py @@ -313,45 +313,49 @@ def make_rois(num_rois=1000): rois[:, 3:] += rois[:, 1:3] # make sure boxes aren't degenerate return rois - for scale, zero_point in ((1, 0), (2, 10), (0.1, 50)): - for qdtype in (torch.qint8, torch.quint8, torch.qint32): - - x = torch.randint(50, 100, size=(num_batches, n_channels, img_size, img_size)).to(dtype) - qx = torch.quantize_per_tensor(x, scale=scale, zero_point=zero_point, dtype=qdtype) - - rois = make_rois() - qrois = torch.quantize_per_tensor(rois, scale=scale, zero_point=zero_point, dtype=qdtype) - - x, rois = qx.dequantize(), qrois.dequantize() # we want to pass the same inputs - - y = ops.roi_align( - x, - rois, - output_size=pool_size, - spatial_scale=1, - sampling_ratio=-1, - # aligned=aligned, - ) - qy = ops.roi_align( - qx, - qrois, - output_size=pool_size, - spatial_scale=1, - sampling_ratio=-1, - # aligned=aligned, - ) - - # The output qy is itself a 
quantized tensor and there might have been a loss of info when it was - # quantized. For a fair comparison we need to quantize y as well - quantized_float_y = torch.quantize_per_tensor(y, scale=scale, zero_point=zero_point, dtype=qdtype) - n_diff = (quantized_float_y != qy).sum() - diff = torch.abs((quantized_float_y.dequantize() - qy.dequantize())).sum() - self.assertTrue((qy == quantized_float_y).all(), f"{scale}, {zero_point}, {qdtype}, {n_diff}, {diff},") - - if (scale, zero_point) == (1, 0): - # in this case we can assert strict equality as the requantization of the output was the - # identity - self.assertTrue((qy.dequantize() == y.round()).all()) + for aligned in (True, False): + for scale, zero_point in ((1, 0), (2, 10), (0.1, 50)): + for qdtype in (torch.qint8, torch.quint8, torch.qint32): + + x = torch.randint(50, 100, size=(num_batches, n_channels, img_size, img_size)).to(dtype) + qx = torch.quantize_per_tensor(x, scale=scale, zero_point=zero_point, dtype=qdtype) + + rois = make_rois() + qrois = torch.quantize_per_tensor(rois, scale=scale, zero_point=zero_point, dtype=qdtype) + + x, rois = qx.dequantize(), qrois.dequantize() # we want to pass the same inputs + + y = ops.roi_align( + x, + rois, + output_size=pool_size, + spatial_scale=1, + sampling_ratio=-1, + aligned=aligned, + ) + qy = ops.roi_align( + qx, + qrois, + output_size=pool_size, + spatial_scale=1, + sampling_ratio=-1, + aligned=aligned, + ) + + # The output qy is itself a quantized tensor and there might have been a loss of info when it was + # quantized. 
For a fair comparison we need to quantize y as well + quantized_float_y = torch.quantize_per_tensor(y, scale=scale, zero_point=zero_point, dtype=qdtype) + n_diff = (quantized_float_y != qy).sum() + diff = torch.abs((quantized_float_y.dequantize() - qy.dequantize())).sum() + self.assertTrue( + (qy == quantized_float_y).all(), + f"{scale}, {zero_point}, {qdtype}, {n_diff}, {diff},", + ) + + if (scale, zero_point) == (1, 0): + # in this case we can assert strict equality as the requantization of the output was the + # identity + self.assertTrue((qy.dequantize() == y.round()).all()) class PSRoIAlignTester(RoIOpTester, unittest.TestCase): From c0b13fd4cb668e79d12054e2ded9ad9d3c73ce68 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sun, 4 Apr 2021 11:40:21 +0100 Subject: [PATCH 13/26] remove useless part --- test/test_ops.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/test/test_ops.py b/test/test_ops.py index 6b3b085a81e..8d7c1c972a6 100644 --- a/test/test_ops.py +++ b/test/test_ops.py @@ -352,11 +352,6 @@ def make_rois(num_rois=1000): f"{scale}, {zero_point}, {qdtype}, {n_diff}, {diff},", ) - if (scale, zero_point) == (1, 0): - # in this case we can assert strict equality as the requantization of the output was the - # identity - self.assertTrue((qy.dequantize() == y.round()).all()) - class PSRoIAlignTester(RoIOpTester, unittest.TestCase): def fn(self, x, rois, pool_h, pool_w, spatial_scale=1, sampling_ratio=-1, **kwargs): From 8527755c0aa307632aa66726c76c373362d8c298 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sun, 4 Apr 2021 11:44:57 +0100 Subject: [PATCH 14/26] more docs and comments --- torchvision/csrc/ops/roi_align.h | 2 +- torchvision/ops/roi_align.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/torchvision/csrc/ops/roi_align.h b/torchvision/csrc/ops/roi_align.h index 61775e4559d..db14e896fe6 100644 --- a/torchvision/csrc/ops/roi_align.h +++ b/torchvision/csrc/ops/roi_align.h @@ -128,7 +128,7 @@ void 
pre_calc_for_bilinear_interpolate( T hy = 1. - ly, hx = 1. - lx; T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; - // save weights and indeces + // save weights and indices PreCalc pc; pc.pos1 = y_low * width + x_low; pc.pos2 = y_low * width + x_high; diff --git a/torchvision/ops/roi_align.py b/torchvision/ops/roi_align.py index c062c6f91f3..0915aa07fa6 100644 --- a/torchvision/ops/roi_align.py +++ b/torchvision/ops/roi_align.py @@ -28,7 +28,7 @@ def roi_align( then the first column should contain the batch index. If a list of Tensors is passed, then each Tensor will correspond to the boxes for an element i in a batch. - output_size (int or Tuple[int, int]): the size of the output after the pooling + output_size (int or Tuple[int, int]): the size of the output (in bins or pixels) after the pooling is performed, as (height, width). spatial_scale (float): a scaling factor that maps the input coordinates to the box coordinates. Default: 1.0 @@ -38,8 +38,8 @@ def roi_align( <= 0, then an adaptive number of grid points are used (computed as ``ceil(roi_width / output_width)``, and likewise for height). Default: -1 aligned (bool): If False, use the legacy implementation. - If True, pixel shift it by -0.5 for align more perfectly about two neighboring pixel indices. - This version is used in Detectron2 + If True, pixel shift the box coordinates it by -0.5 for a better alignment with the two + neighboring pixel indices. 
This version is used in Detectron2 Returns: output (Tensor[K, C, output_size[0], output_size[1]]) From efef48af7868a9cf88c5aeb381ee32cc7ba8002e Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 5 Apr 2021 10:53:44 +0100 Subject: [PATCH 15/26] Put back optimization with more robust test --- test/test_ops.py | 24 +++++++++---- .../ops/quantized/cpu/qroi_align_kernel.cpp | 36 ++++++------------- 2 files changed, 29 insertions(+), 31 deletions(-) diff --git a/test/test_ops.py b/test/test_ops.py index 8d7c1c972a6..4a73fe8ee74 100644 --- a/test/test_ops.py +++ b/test/test_ops.py @@ -345,12 +345,24 @@ def make_rois(num_rois=1000): # The output qy is itself a quantized tensor and there might have been a loss of info when it was # quantized. For a fair comparison we need to quantize y as well quantized_float_y = torch.quantize_per_tensor(y, scale=scale, zero_point=zero_point, dtype=qdtype) - n_diff = (quantized_float_y != qy).sum() - diff = torch.abs((quantized_float_y.dequantize() - qy.dequantize())).sum() - self.assertTrue( - (qy == quantized_float_y).all(), - f"{scale}, {zero_point}, {qdtype}, {n_diff}, {diff},", - ) + + try: + # Ideally, we would assert this, which passes with (scale, zero) == (1, 0) + self.assertTrue((qy == quantized_float_y).all()) + except AssertionError: + # But because the computation aren't exactly the same between the 2 RoIAlign procedures, some + # rounding error may lead to a difference of 2 in the output. + # For example with (scale, zero) = (2, 10), 45.00000... will be quantized to 44 + # but 45.00000001 will be rounded to 46. 
We make sure below that: + # - such discrepancies between qy and quantized_float_y are very rare (less then 5%) + # - any difference between qy and quantized_float_y is == scale + diff_idx = torch.where(qy != quantized_float_y) + num_diff = diff_idx[0].numel() + self.assertTrue(num_diff / qy.numel() < .05) + + abs_diff = torch.abs(qy[diff_idx].dequantize() - quantized_float_y[diff_idx].dequantize()) + t_scale = torch.full_like(abs_diff, fill_value=scale) + self.assertTrue(torch.allclose(abs_diff, t_scale, atol=1e-5)) class PSRoIAlignTester(RoIOpTester, unittest.TestCase): diff --git a/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp b/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp index 10c62cf78b3..ad5a7f6166b 100644 --- a/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp +++ b/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp @@ -109,39 +109,25 @@ void qroi_align_forward_kernel_impl( int index = index_n_c + ph * pooled_width + pw; float output_val = 0.; - // float sum_w = 0.; + float sum_w = 0.; for (int iy = 0; iy < roi_bin_grid_h; iy++) { for (int ix = 0; ix < roi_bin_grid_w; ix++) { detail::PreCalc pc = pre_calc[pre_calc_index]; - output_val += pc.w1 * - at::native::dequantize_val( - input_scale, input_zp, offset_input[pc.pos1]) + - pc.w2 * - at::native::dequantize_val( - input_scale, input_zp, offset_input[pc.pos2]) + - pc.w3 * - at::native::dequantize_val( - input_scale, input_zp, offset_input[pc.pos3]) + - pc.w4 * - at::native::dequantize_val( - input_scale, input_zp, offset_input[pc.pos4]); - - // FIXME: Possible optimization. Unfortunately the tests fail - // on some (few) inputs: Python rounds up while the C++ code - // rounds down (or the other way around). 
- // output_val += pc.w1 * offset_input[pc.pos1].val_ + - // pc.w2 * offset_input[pc.pos2].val_ + - // pc.w3 * offset_input[pc.pos3].val_ + - // pc.w4 * offset_input[pc.pos4].val_; - // sum_w += pc.w1 + pc.w2 + pc.w3 + pc.w4; - // And then dequantize later, just before averaging: - // output_val = input_scale * (output_val - (float)input_zp * - // sum_w); + // Optimization: we use the raw values here and we'll dequantize + // later + output_val += pc.w1 * offset_input[pc.pos1].val_ + + pc.w2 * offset_input[pc.pos2].val_ + + pc.w3 * offset_input[pc.pos3].val_ + + pc.w4 * offset_input[pc.pos4].val_; + sum_w += pc.w1 + pc.w2 + pc.w3 + pc.w4; pre_calc_index += 1; } } + // Dequantize here + output_val = input_scale * (output_val - (float)input_zp * sum_w); + output_val /= count; // Average pooling output[index] = From d6f78ab0af9ef3f859733645191b5ebfb57dd1ba Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 7 Apr 2021 10:53:10 +0100 Subject: [PATCH 16/26] Added check for index upper bound --- test/test_ops.py | 13 ++++++++++--- .../csrc/ops/quantized/cpu/qroi_align_kernel.cpp | 13 +++++++++++-- 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/test/test_ops.py b/test/test_ops.py index 4a73fe8ee74..a0943d48687 100644 --- a/test/test_ops.py +++ b/test/test_ops.py @@ -304,12 +304,12 @@ def test_qroialign(self): pool_size = 5 img_size = 10 n_channels = 2 - num_batches = 2 + num_imgs = 2 dtype = torch.float def make_rois(num_rois=1000): rois = torch.randint(0, img_size // 2, size=(num_rois, 5)).to(dtype) - rois[:, 0] = torch.randint(0, num_batches, size=(num_rois,)) # set batch index + rois[:, 0] = torch.randint(0, num_imgs, size=(num_rois,)) # set batch index rois[:, 3:] += rois[:, 1:3] # make sure boxes aren't degenerate return rois @@ -317,7 +317,7 @@ def make_rois(num_rois=1000): for scale, zero_point in ((1, 0), (2, 10), (0.1, 50)): for qdtype in (torch.qint8, torch.quint8, torch.qint32): - x = torch.randint(50, 100, size=(num_batches, n_channels, 
img_size, img_size)).to(dtype) + x = torch.randint(50, 100, size=(num_imgs, n_channels, img_size, img_size)).to(dtype) qx = torch.quantize_per_tensor(x, scale=scale, zero_point=zero_point, dtype=qdtype) rois = make_rois() @@ -364,6 +364,13 @@ def make_rois(num_rois=1000): t_scale = torch.full_like(abs_diff, fill_value=scale) self.assertTrue(torch.allclose(abs_diff, t_scale, atol=1e-5)) + x = torch.randint(50, 100, size=(129, 3, 10, 10)).to(dtype) + qx = torch.quantize_per_tensor(x, scale=0, zero_point=1, dtype=torch.qint8) + rois = make_rois(10) + qrois = torch.quantize_per_tensor(rois, scale=0, zero_point=1, dtype=torch.qint8) + with self.assertRaisesRegex(RuntimeError, "There are 129 input images in the batch, but the RoIs tensor"): + ops.roi_align(qx, qrois, output_size=pool_size) + class PSRoIAlignTester(RoIOpTester, unittest.TestCase): def fn(self, x, rois, pool_h, pool_w, spatial_scale=1, sampling_ratio=-1, **kwargs): diff --git a/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp b/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp index ad5a7f6166b..7b88cbe2b9b 100644 --- a/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp +++ b/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp @@ -36,8 +36,7 @@ void qroi_align_forward_kernel_impl( const T* offset_rois = rois + n * 5; int roi_batch_ind = at::native::dequantize_val( - rois_scale, rois_zp, offset_rois[0]); // FIXME: This can be out of the - // range of the quantized type!! + rois_scale, rois_zp, offset_rois[0]); // Do not using rounding; this implementation detail is critical float offset = aligned ? 0.5 : 0.; @@ -172,6 +171,16 @@ at::Tensor qroi_align_forward_kernel( return output; AT_DISPATCH_QINT_TYPES(input.scalar_type(), "qroi_align_forward_kernel", [&] { + // Note: q_max relates to the input tensor, but we need that of the rois + // tensor. They're the same since we make sure rois and input have the same + // type above. 
+ uint64_t max_indexable = std::numeric_limits::max() + 1; + std::string err_msg = "There are " + std::to_string(input.size(0)) + + " input images in the batch, but the RoIs tensor can only index up to " + + std::to_string(max_indexable) + + " images. Try to reduce the batch size."; + TORCH_CHECK(input.size(0) <= max_indexable, err_msg); + qroi_align_forward_kernel_impl( num_rois, input, From 61564ca8183f9406c6e019a151c731d232191a89 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 7 Apr 2021 11:31:46 +0100 Subject: [PATCH 17/26] avoid possible overflow --- torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp b/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp index 7b88cbe2b9b..012c897cb6e 100644 --- a/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp +++ b/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp @@ -174,12 +174,12 @@ at::Tensor qroi_align_forward_kernel( // Note: q_max relates to the input tensor, but we need that of the rois // tensor. They're the same since we make sure rois and input have the same // type above. - uint64_t max_indexable = std::numeric_limits::max() + 1; + int64_t q_max = std::numeric_limits::max(); std::string err_msg = "There are " + std::to_string(input.size(0)) + " input images in the batch, but the RoIs tensor can only index up to " + - std::to_string(max_indexable) + + std::to_string(q_max + 1) + " images. 
Try to reduce the batch size."; - TORCH_CHECK(input.size(0) <= max_indexable, err_msg); + TORCH_CHECK(input.size(0) - 1 <= q_max, err_msg); qroi_align_forward_kernel_impl( num_rois, From 369fd33d9c62e6369acac96f2c0cd5eb6f79cc50 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 7 Apr 2021 12:00:54 +0100 Subject: [PATCH 18/26] Move common function into common.h --- torchvision/csrc/ops/cpu/roi_align_common.h | 118 ++++++++++++++++++ torchvision/csrc/ops/cpu/roi_align_kernel.cpp | 2 +- .../ops/quantized/cpu/qroi_align_kernel.cpp | 2 +- torchvision/csrc/ops/roi_align.h | 116 ----------------- 4 files changed, 120 insertions(+), 118 deletions(-) create mode 100644 torchvision/csrc/ops/cpu/roi_align_common.h diff --git a/torchvision/csrc/ops/cpu/roi_align_common.h b/torchvision/csrc/ops/cpu/roi_align_common.h new file mode 100644 index 00000000000..c34762f8222 --- /dev/null +++ b/torchvision/csrc/ops/cpu/roi_align_common.h @@ -0,0 +1,118 @@ +#pragma once + +#include + +namespace vision { +namespace ops { +namespace detail { + +template +struct PreCalc { + int pos1; + int pos2; + int pos3; + int pos4; + T w1; + T w2; + T w3; + T w4; +}; + +template +void pre_calc_for_bilinear_interpolate( + int height, + int width, + int pooled_height, + int pooled_width, + T roi_start_h, + T roi_start_w, + T bin_size_h, + T bin_size_w, + int roi_bin_grid_h, + int roi_bin_grid_w, + std::vector>& pre_calc); + int pre_calc_index = 0; + for (int ph = 0; ph < pooled_height; ph++) { + for (int pw = 0; pw < pooled_width; pw++) { + for (int iy = 0; iy < roi_bin_grid_h; iy++) { + const T yy = roi_start_h + ph * bin_size_h + + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 + for (int ix = 0; ix < roi_bin_grid_w; ix++) { + const T xx = roi_start_w + pw * bin_size_w + + static_cast(ix + .5f) * bin_size_w / + static_cast(roi_bin_grid_w); + + T x = xx; + T y = yy; + // deal with: inverse elements are out of feature map boundary + if (y < -1.0 || y > height || 
x < -1.0 || x > width) { + // empty + PreCalc pc; + pc.pos1 = 0; + pc.pos2 = 0; + pc.pos3 = 0; + pc.pos4 = 0; + pc.w1 = 0; + pc.w2 = 0; + pc.w3 = 0; + pc.w4 = 0; + pre_calc[pre_calc_index] = pc; + pre_calc_index += 1; + continue; + } + + if (y <= 0) { + y = 0; + } + if (x <= 0) { + x = 0; + } + + int y_low = (int)y; + int x_low = (int)x; + int y_high; + int x_high; + + if (y_low >= height - 1) { + y_high = y_low = height - 1; + y = (T)y_low; + } else { + y_high = y_low + 1; + } + + if (x_low >= width - 1) { + x_high = x_low = width - 1; + x = (T)x_low; + } else { + x_high = x_low + 1; + } + + T ly = y - y_low; + T lx = x - x_low; + T hy = 1. - ly, hx = 1. - lx; + T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; + + // save weights and indices + PreCalc pc; + pc.pos1 = y_low * width + x_low; + pc.pos2 = y_low * width + x_high; + pc.pos3 = y_high * width + x_low; + pc.pos4 = y_high * width + x_high; + pc.w1 = w1; + pc.w2 = w2; + pc.w3 = w3; + pc.w4 = w4; + pre_calc[pre_calc_index] = pc; + + pre_calc_index += 1; + } + } + } + } +} + + +} // namespace detail +} // namespace ops +} // namespace vision \ No newline at end of file diff --git a/torchvision/csrc/ops/cpu/roi_align_kernel.cpp b/torchvision/csrc/ops/cpu/roi_align_kernel.cpp index 619064cb40e..e6684e953d0 100644 --- a/torchvision/csrc/ops/cpu/roi_align_kernel.cpp +++ b/torchvision/csrc/ops/cpu/roi_align_kernel.cpp @@ -1,7 +1,7 @@ #include #include -#include "../roi_align.h" +#include "./roi_align_common.h" namespace vision { namespace ops { diff --git a/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp b/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp index 012c897cb6e..6143948525e 100644 --- a/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp +++ b/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp @@ -2,7 +2,7 @@ #include #include -#include "../../roi_align.h" +#include "../../cpu/roi_align_common.h" namespace vision { namespace ops { diff --git 
a/torchvision/csrc/ops/roi_align.h b/torchvision/csrc/ops/roi_align.h index db14e896fe6..6a1cca35a3d 100644 --- a/torchvision/csrc/ops/roi_align.h +++ b/torchvision/csrc/ops/roi_align.h @@ -30,122 +30,6 @@ at::Tensor _roi_align_backward( int64_t sampling_ratio, bool aligned); -template -struct PreCalc { - int pos1; - int pos2; - int pos3; - int pos4; - T w1; - T w2; - T w3; - T w4; -}; - -// This helper computes the interpolation weights (w1, w2...) for every sampling -// point of a given box. There are pool_height * pool_width * roi_bin_grid_h * -// roi_bin_grid_w such sampling points. -// -// The weights (w1, w2...) are computed as the areas in this figure: -// https://en.wikipedia.org/wiki/Bilinear_interpolation#/media/File:Bilinear_interpolation_visualisation.svg -// and pos1, pos2 etc correspond to the indices of their respective pixels. -// -// Note: the weights and indices are shared across all channels, which is why -// they are pre-calculated prior to the main loop in the RoIAlign kernel. 
-// implementation taken from Caffe2 -template -void pre_calc_for_bilinear_interpolate( - int height, - int width, - int pooled_height, - int pooled_width, - T roi_start_h, - T roi_start_w, - T bin_size_h, - T bin_size_w, - int roi_bin_grid_h, - int roi_bin_grid_w, - std::vector>& pre_calc) { - int pre_calc_index = 0; - for (int ph = 0; ph < pooled_height; ph++) { - for (int pw = 0; pw < pooled_width; pw++) { - for (int iy = 0; iy < roi_bin_grid_h; iy++) { - const T yy = roi_start_h + ph * bin_size_h + - static_cast(iy + .5f) * bin_size_h / - static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 - for (int ix = 0; ix < roi_bin_grid_w; ix++) { - const T xx = roi_start_w + pw * bin_size_w + - static_cast(ix + .5f) * bin_size_w / - static_cast(roi_bin_grid_w); - - T x = xx; - T y = yy; - // deal with: inverse elements are out of feature map boundary - if (y < -1.0 || y > height || x < -1.0 || x > width) { - // empty - PreCalc pc; - pc.pos1 = 0; - pc.pos2 = 0; - pc.pos3 = 0; - pc.pos4 = 0; - pc.w1 = 0; - pc.w2 = 0; - pc.w3 = 0; - pc.w4 = 0; - pre_calc[pre_calc_index] = pc; - pre_calc_index += 1; - continue; - } - - if (y <= 0) { - y = 0; - } - if (x <= 0) { - x = 0; - } - - int y_low = (int)y; - int x_low = (int)x; - int y_high; - int x_high; - - if (y_low >= height - 1) { - y_high = y_low = height - 1; - y = (T)y_low; - } else { - y_high = y_low + 1; - } - - if (x_low >= width - 1) { - x_high = x_low = width - 1; - x = (T)x_low; - } else { - x_high = x_low + 1; - } - - T ly = y - y_low; - T lx = x - x_low; - T hy = 1. - ly, hx = 1. 
- lx; - T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; - - // save weights and indices - PreCalc pc; - pc.pos1 = y_low * width + x_low; - pc.pos2 = y_low * width + x_high; - pc.pos3 = y_high * width + x_low; - pc.pos4 = y_high * width + x_high; - pc.w1 = w1; - pc.w2 = w2; - pc.w3 = w3; - pc.w4 = w4; - pre_calc[pre_calc_index] = pc; - - pre_calc_index += 1; - } - } - } - } -} } // namespace detail From bcadc0f3c27dd9a8b225d2f776a48c151827e265 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 7 Apr 2021 12:01:41 +0100 Subject: [PATCH 19/26] oops --- torchvision/csrc/ops/cpu/roi_align_common.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchvision/csrc/ops/cpu/roi_align_common.h b/torchvision/csrc/ops/cpu/roi_align_common.h index c34762f8222..bc3031a58b6 100644 --- a/torchvision/csrc/ops/cpu/roi_align_common.h +++ b/torchvision/csrc/ops/cpu/roi_align_common.h @@ -30,7 +30,7 @@ void pre_calc_for_bilinear_interpolate( T bin_size_w, int roi_bin_grid_h, int roi_bin_grid_w, - std::vector>& pre_calc); + std::vector>& pre_calc){ int pre_calc_index = 0; for (int ph = 0; ph < pooled_height; ph++) { for (int pw = 0; pw < pooled_width; pw++) { From 6792e65955b59038c24785a07dfbc944d51539d7 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 7 Apr 2021 13:16:23 +0100 Subject: [PATCH 20/26] scale=1,zero_point=0 makes more sense --- test/test_ops.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test_ops.py b/test/test_ops.py index a0943d48687..d50b6905632 100644 --- a/test/test_ops.py +++ b/test/test_ops.py @@ -365,9 +365,9 @@ def make_rois(num_rois=1000): self.assertTrue(torch.allclose(abs_diff, t_scale, atol=1e-5)) x = torch.randint(50, 100, size=(129, 3, 10, 10)).to(dtype) - qx = torch.quantize_per_tensor(x, scale=0, zero_point=1, dtype=torch.qint8) + qx = torch.quantize_per_tensor(x, scale=1, zero_point=0, dtype=torch.qint8) rois = make_rois(10) - qrois = torch.quantize_per_tensor(rois, scale=0, zero_point=1, 
dtype=torch.qint8) + qrois = torch.quantize_per_tensor(rois, scale=1, zero_point=0, dtype=torch.qint8) with self.assertRaisesRegex(RuntimeError, "There are 129 input images in the batch, but the RoIs tensor"): ops.roi_align(qx, qrois, output_size=pool_size) From dde14ed370fd5b192df4fa8c2b684bc90036bb6c Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 7 Apr 2021 15:01:57 +0100 Subject: [PATCH 21/26] Force batch size of 1 to prevent any indexing bug --- test/test_ops.py | 6 ++--- torchvision/csrc/ops/cpu/roi_align_common.h | 1 - .../ops/quantized/cpu/qroi_align_kernel.cpp | 22 +++++++++---------- 3 files changed, 13 insertions(+), 16 deletions(-) diff --git a/test/test_ops.py b/test/test_ops.py index d50b6905632..8c63c9c29c6 100644 --- a/test/test_ops.py +++ b/test/test_ops.py @@ -304,7 +304,7 @@ def test_qroialign(self): pool_size = 5 img_size = 10 n_channels = 2 - num_imgs = 2 + num_imgs = 1 dtype = torch.float def make_rois(num_rois=1000): @@ -364,11 +364,11 @@ def make_rois(num_rois=1000): t_scale = torch.full_like(abs_diff, fill_value=scale) self.assertTrue(torch.allclose(abs_diff, t_scale, atol=1e-5)) - x = torch.randint(50, 100, size=(129, 3, 10, 10)).to(dtype) + x = torch.randint(50, 100, size=(2, 3, 10, 10)).to(dtype) qx = torch.quantize_per_tensor(x, scale=1, zero_point=0, dtype=torch.qint8) rois = make_rois(10) qrois = torch.quantize_per_tensor(rois, scale=1, zero_point=0, dtype=torch.qint8) - with self.assertRaisesRegex(RuntimeError, "There are 129 input images in the batch, but the RoIs tensor"): + with self.assertRaisesRegex(RuntimeError, "Only one image per batch is allowed"): ops.roi_align(qx, qrois, output_size=pool_size) diff --git a/torchvision/csrc/ops/cpu/roi_align_common.h b/torchvision/csrc/ops/cpu/roi_align_common.h index bc3031a58b6..450db1f13f7 100644 --- a/torchvision/csrc/ops/cpu/roi_align_common.h +++ b/torchvision/csrc/ops/cpu/roi_align_common.h @@ -112,7 +112,6 @@ void pre_calc_for_bilinear_interpolate( } } - } // namespace detail }
// namespace ops } // namespace vision \ No newline at end of file diff --git a/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp b/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp index 6143948525e..e34b277747e 100644 --- a/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp +++ b/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp @@ -35,8 +35,9 @@ void qroi_align_forward_kernel_impl( int index_n = n * channels * pooled_width * pooled_height; const T* offset_rois = rois + n * 5; - int roi_batch_ind = at::native::dequantize_val( - rois_scale, rois_zp, offset_rois[0]); + + // FIXME: change this when batches of size > 1 are allowed + const int roi_batch_ind = 0; // Do not using rounding; this implementation detail is critical float offset = aligned ? 0.5 : 0.; @@ -148,6 +149,13 @@ at::Tensor qroi_align_forward_kernel( TORCH_CHECK(input.device().is_cpu(), "input must be a CPU tensor"); TORCH_CHECK(rois.device().is_cpu(), "rois must be a CPU tensor"); TORCH_CHECK(rois.size(1) == 5, "rois must have shape as Tensor[K, 5]"); + // The first column of the RoI tensor is an image index, but not all indices + // are representable depending on the quantization. For example 1, 3, 5... + // indices can't be represented when qscale is 2. To prevent any bug, we force + // a batch size of 1 and we ignore the first column + TORCH_CHECK( + input.size(0) == 1, + "Only one image per batch is allowed in roi_align when quantized tensors are passed."); at::TensorArg input_t{input, "input", 1}, rois_t{rois, "rois", 2}; @@ -171,16 +179,6 @@ at::Tensor qroi_align_forward_kernel( return output; AT_DISPATCH_QINT_TYPES(input.scalar_type(), "qroi_align_forward_kernel", [&] { - // Note: q_max relates to the input tensor, but we need that of the rois - // tensor. They're the same since we make sure rois and input have the same - // type above. 
- int64_t q_max = std::numeric_limits::max(); - std::string err_msg = "There are " + std::to_string(input.size(0)) + - " input images in the batch, but the RoIs tensor can only index up to " + - std::to_string(q_max + 1) + - " images. Try to reduce the batch size."; - TORCH_CHECK(input.size(0) - 1 <= q_max, err_msg); - qroi_align_forward_kernel_impl( num_rois, input, From 457aab025bd77815602190a30e87e2443c14757c Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 7 Apr 2021 15:06:47 +0100 Subject: [PATCH 22/26] format --- torchvision/csrc/ops/cpu/roi_align_common.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchvision/csrc/ops/cpu/roi_align_common.h b/torchvision/csrc/ops/cpu/roi_align_common.h index 450db1f13f7..e59595101ae 100644 --- a/torchvision/csrc/ops/cpu/roi_align_common.h +++ b/torchvision/csrc/ops/cpu/roi_align_common.h @@ -30,7 +30,7 @@ void pre_calc_for_bilinear_interpolate( T bin_size_w, int roi_bin_grid_h, int roi_bin_grid_w, - std::vector>& pre_calc){ + std::vector>& pre_calc) { int pre_calc_index = 0; for (int ph = 0; ph < pooled_height; ph++) { for (int pw = 0; pw < pooled_width; pw++) { From 0c7bb11feb7552f51629b589f97f33b6a9a68d6c Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 7 Apr 2021 15:43:44 +0100 Subject: [PATCH 23/26] format again --- torchvision/csrc/ops/roi_align.h | 1 - 1 file changed, 1 deletion(-) diff --git a/torchvision/csrc/ops/roi_align.h b/torchvision/csrc/ops/roi_align.h index 6a1cca35a3d..2ddb6ac3945 100644 --- a/torchvision/csrc/ops/roi_align.h +++ b/torchvision/csrc/ops/roi_align.h @@ -30,7 +30,6 @@ at::Tensor _roi_align_backward( int64_t sampling_ratio, bool aligned); - } // namespace detail } // namespace ops From e96cf1a029ea91852f0c1d2fe3de56445a7b0973 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 7 Apr 2021 15:48:06 +0100 Subject: [PATCH 24/26] updated docstring --- torchvision/ops/roi_align.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git 
a/torchvision/ops/roi_align.py b/torchvision/ops/roi_align.py index ae4c9dd3678..2e58646151f 100644 --- a/torchvision/ops/roi_align.py +++ b/torchvision/ops/roi_align.py @@ -20,14 +20,14 @@ def roi_align( Performs Region of Interest (RoI) Align operator with average pooling, as described in Mask R-CNN. Args: - input (Tensor[N, C, H, W]): input tensor + input (Tensor[N, C, H, W]): The input tensor, i.e. a batch with ``N`` feature maps boxes (Tensor[K, 5] or List[Tensor[L, 4]]): the box coordinates in (x1, y1, x2, y2) format where the regions will be taken from. The coordinate must satisfy ``0 <= x1 < x2`` and ``0 <= y1 < y2``. - If a single Tensor is passed, - then the first column should contain the batch index. If a list of Tensors - is passed, then each Tensor will correspond to the boxes for an element i - in a batch. + If a single Tensor is passed, then the first column should + contain the index of the corresponding feature map in the batch, i.e. a number in ``[0, N - 1]``. + If a list of Tensors is passed, then each Tensor will correspond to the boxes for an element i + in the batch. output_size (int or Tuple[int, int]): the size of the output (in bins or pixels) after the pooling is performed, as (height, width). 
spatial_scale (float): a scaling factor that maps the input coordinates to From 45d083fc1693b5cd56524a4ab46ed23be42698d6 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 8 Apr 2021 09:50:51 +0100 Subject: [PATCH 25/26] put back description comment for pre_calc_bilinear_interpolate --- torchvision/csrc/ops/cpu/roi_align_common.h | 13 ++++++++++++- torchvision/ops/roi_align.py | 3 ++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/torchvision/csrc/ops/cpu/roi_align_common.h b/torchvision/csrc/ops/cpu/roi_align_common.h index e59595101ae..e10c67b5b79 100644 --- a/torchvision/csrc/ops/cpu/roi_align_common.h +++ b/torchvision/csrc/ops/cpu/roi_align_common.h @@ -18,6 +18,17 @@ struct PreCalc { T w4; }; +// This helper computes the interpolation weights (w1, w2...) for every sampling +// point of a given box. There are pool_height * pool_width * roi_bin_grid_h * +// roi_bin_grid_w such sampling points. +// +// The weights (w1, w2...) are computed as the areas in this figure: +// https://en.wikipedia.org/wiki/Bilinear_interpolation#/media/File:Bilinear_interpolation_visualisation.svg +// and pos1, pos2 etc correspond to the indices of their respective pixels. +// +// Note: the weights and indices are shared across all channels, which is why +// they are pre-calculated prior to the main loop in the RoIAlign kernel. +// implementation taken from Caffe2 template void pre_calc_for_bilinear_interpolate( int height, @@ -114,4 +125,4 @@ void pre_calc_for_bilinear_interpolate( } // namespace detail } // namespace ops -} // namespace vision \ No newline at end of file +} // namespace vision diff --git a/torchvision/ops/roi_align.py b/torchvision/ops/roi_align.py index 2e58646151f..b29101a448c 100644 --- a/torchvision/ops/roi_align.py +++ b/torchvision/ops/roi_align.py @@ -20,7 +20,8 @@ def roi_align( Performs Region of Interest (RoI) Align operator with average pooling, as described in Mask R-CNN. Args: - input (Tensor[N, C, H, W]): The input tensor, i.e. 
a batch with ``N`` feature maps + input (Tensor[N, C, H, W]): The input tensor, i.e. a batch with ``N`` feature maps. + If the tensor is quantized, we expect a batch size of ``N == 1``. boxes (Tensor[K, 5] or List[Tensor[L, 4]]): the box coordinates in (x1, y1, x2, y2) format where the regions will be taken from. The coordinate must satisfy ``0 <= x1 < x2`` and ``0 <= y1 < y2``. From 3ab6b6657977b9fbb14b7c26feea779ca88ebd8f Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 8 Apr 2021 09:59:37 +0100 Subject: [PATCH 26/26] revert most changes to docstring as it's taken care of in another PR --- torchvision/ops/roi_align.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/torchvision/ops/roi_align.py b/torchvision/ops/roi_align.py index b29101a448c..c0ac14329d4 100644 --- a/torchvision/ops/roi_align.py +++ b/torchvision/ops/roi_align.py @@ -17,30 +17,30 @@ def roi_align( aligned: bool = False, ) -> Tensor: """ - Performs Region of Interest (RoI) Align operator with average pooling, as described in Mask R-CNN. + Performs Region of Interest (RoI) Align operator described in Mask R-CNN Args: - input (Tensor[N, C, H, W]): The input tensor, i.e. a batch with ``N`` feature maps. + input (Tensor[N, C, H, W]): input tensor If the tensor is quantized, we expect a batch size of ``N == 1``. boxes (Tensor[K, 5] or List[Tensor[L, 4]]): the box coordinates in (x1, y1, x2, y2) format where the regions will be taken from. The coordinate must satisfy ``0 <= x1 < x2`` and ``0 <= y1 < y2``. - If a single Tensor is passed, then the first column should - contain the index of the corresponding feature map in the batch, i.e. a number in ``[0, N - 1]``. - If a list of Tensors is passed, then each Tensor will correspond to the boxes for an element i - in the batch. - output_size (int or Tuple[int, int]): the size of the output (in bins or pixels) after the pooling - is performed, as (height, width). 
+ If a single Tensor is passed, + then the first column should contain the batch index. If a list of Tensors + is passed, then each Tensor will correspond to the boxes for an element i + in a batch + output_size (int or Tuple[int, int]): the size of the output after the cropping + is performed, as (height, width) spatial_scale (float): a scaling factor that maps the input coordinates to the box coordinates. Default: 1.0 sampling_ratio (int): number of sampling points in the interpolation grid used to compute the output value of each pooled output bin. If > 0, - then exactly ``sampling_ratio x sampling_ratio`` sampling points per bin are used. If + then exactly sampling_ratio x sampling_ratio grid points are used. If <= 0, then an adaptive number of grid points are used (computed as - ``ceil(roi_width / output_width)``, and likewise for height). Default: -1 + ceil(roi_width / pooled_w), and likewise for height). Default: -1 aligned (bool): If False, use the legacy implementation. - If True, pixel shift the box coordinates it by -0.5 for a better alignment with the two - neighboring pixel indices. This version is used in Detectron2 + If True, pixel shift it by -0.5 for align more perfectly about two neighboring pixel indices. + This version in Detectron2 Returns: Tensor[K, C, output_size[0], output_size[1]]: The pooled RoIs.