From bd7f639be51480416c8fb3e36accb035ff1f5208 Mon Sep 17 00:00:00 2001
From: Nicolas Hug <nicolashug@fb.com>
Date: Wed, 31 Mar 2021 16:36:54 +0100
Subject: [PATCH 01/26] WIP

---
 .../ops/quantized/cpu/qroi_align_kernel.cpp   | 286 ++++++++++++++++++
 1 file changed, 286 insertions(+)
 create mode 100644 torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp
diff --git a/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp b/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp
new file mode 100644
index 00000000000..2926259a70a
--- /dev/null
+++ b/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp
@@ -0,0 +1,286 @@
+#include <ATen/ATen.h>
+#include <torch/library.h>
+#include <ATen/native/quantized/affine_quantizer.h>
+#include <ATen/quantized/Quantizer.h>
+#include <ATen/cpu/vec256/vec256.h>
+
+namespace vision {
+namespace ops {
+
+namespace {
+
+template <typename T>
+struct PreCalc {
+  int pos1;
+  int pos2;
+  int pos3;
+  int pos4;
+  T w1;
+  T w2;
+  T w3;
+  T w4;
+};
+
+template <typename T>
+void pre_calc_for_bilinear_interpolate(
+    int height,
+    int width,
+    int pooled_height,
+    int pooled_width,
+    int iy_upper,
+    int ix_upper,
+    T roi_start_h,
+    T roi_start_w,
+    T bin_size_h,
+    T bin_size_w,
+    int roi_bin_grid_h,
+    int roi_bin_grid_w,
+    std::vector<PreCalc<T>>& pre_calc) {
+  int pre_calc_index = 0;
+  for (int ph = 0; ph < pooled_height; ph++) {
+    for (int pw = 0; pw < pooled_width; pw++) {
+      for (int iy = 0; iy < iy_upper; iy++) {
+        const T yy = roi_start_h + ph * bin_size_h +
+            static_cast<T>(iy + .5f) * bin_size_h /
+                static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
+        for (int ix = 0; ix < ix_upper; ix++) {
+          const T xx = roi_start_w + pw * bin_size_w +
+              static_cast<T>(ix + .5f) * bin_size_w /
+                  static_cast<T>(roi_bin_grid_w);
+
+          T x = xx;
+          T y = yy;
+          // deal with: inverse elements are out of feature map boundary
+          if (y < -1.0 || y > height || x < -1.0 || x > width) {
+            // empty
+            PreCalc<T> pc;
+            pc.pos1 = 0;
+            pc.pos2 = 0;
+            pc.pos3 = 0;
+            pc.pos4 = 0;
+            pc.w1 = 0;
+            pc.w2 = 0;
+            pc.w3 = 0;
+            pc.w4 = 0;
+            pre_calc[pre_calc_index] = pc;
+            pre_calc_index += 1;
+            continue;
+          }
+
+          if (y <= 0) {
+            y = 0;
+          }
+          if (x <= 0) {
+            x = 0;
+          }
+
+          int y_low = (int)y;
+          int x_low = (int)x;
+          int y_high;
+          int x_high;
+
+          if (y_low >= height - 1) {
+            y_high = y_low = height - 1;
+            y = (T)y_low;
+          } else {
+            y_high = y_low + 1;
+          }
+
+          if (x_low >= width - 1) {
+            x_high = x_low = width - 1;
+            x = (T)x_low;
+          } else {
+            x_high = x_low + 1;
+          }
+
+          T ly = y - y_low;
+          T lx = x - x_low;
+          T hy = 1. - ly, hx = 1. - lx;
+          T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
+
+          // save weights and indeces
+          PreCalc<T> pc;
+          pc.pos1 = y_low * width + x_low;
+          pc.pos2 = y_low * width + x_high;
+          pc.pos3 = y_high * width + x_low;
+          pc.pos4 = y_high * width + x_high;
+          pc.w1 = w1;
+          pc.w2 = w2;
+          pc.w3 = w3;
+          pc.w4 = w4;
+          pre_calc[pre_calc_index] = pc;
+
+          pre_calc_index += 1;
+        }
+      }
+    }
+  }
+}
+
+
+template <typename T>
+void qroi_align_forward_kernel_impl(
+    int n_rois,
+    const T* input,
+    double & spatial_scale,
+    int channels,
+    int height,
+    int width,
+    int pooled_height,
+    int pooled_width,
+    int sampling_ratio,
+    bool aligned,
+    const T* rois,
+    T* output,
+    int64_t output_size) {
+
+  for (int64_t i = 0; i < output_size; i++) {
+      output[i].val_ = 0;
+  }
+
+  for (int n = 0; n < n_rois; n++) {
+    int index_n = n * channels * pooled_width * pooled_height;
+
+    const T* offset_rois = rois + n * 5;
+    int roi_batch_ind = offset_rois[0].val_;
+
+    // Do not using rounding; this implementation detail is critical
+    // T offset = aligned ? (T)0.5 : (T)0.0;
+    int offset = 0;  // TODO fix this
+    float roi_start_w = offset_rois[1].val_ * spatial_scale - offset;
+    float roi_start_h = offset_rois[2].val_ * spatial_scale - offset;
+    float roi_end_w = offset_rois[3].val_ * spatial_scale - offset;
+    float roi_end_h = offset_rois[4].val_ * spatial_scale - offset;
+
+    float roi_width = roi_end_w - roi_start_w;
+    float roi_height = roi_end_h - roi_start_h;
+    if (!aligned) {
+      // Force malformed ROIs to be 1x1
+      roi_width = std::max(roi_width, 1.f);
+      roi_height = std::max(roi_height, 1.f);
+    }
+
+    float bin_size_h = roi_height / pooled_height;
+    float bin_size_w = roi_width / pooled_width;
+
+    // We use roi_bin_grid to sample the grid and mimic integral
+    int roi_bin_grid_h = (sampling_ratio > 0)
+        ? sampling_ratio
+        : ceil(roi_height / pooled_height); // e.g., = 2
+    int roi_bin_grid_w =
+        (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
+
+    // We do average (integral) pooling inside a bin
+    // When the grid is empty, output zeros.
+    const int count = std::max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. = 4
+
+    // we want to precalculate indeces and weights shared by all chanels,
+    // this is the key point of optimiation
+    std::vector<PreCalc<float>> pre_calc(
+        roi_bin_grid_h * roi_bin_grid_w * pooled_width * pooled_height);
+    pre_calc_for_bilinear_interpolate(
+        height,
+        width,
+        pooled_height,
+        pooled_width,
+        roi_bin_grid_h,
+        roi_bin_grid_w,
+        roi_start_h,
+        roi_start_w,
+        bin_size_h,
+        bin_size_w,
+        roi_bin_grid_h,
+        roi_bin_grid_w,
+        pre_calc);
+
+    for (int c = 0; c < channels; c++) {
+      int index_n_c = index_n + c * pooled_width * pooled_height;
+      const T* offset_input =
+          input + (roi_batch_ind * channels + c) * height * width;
+      int pre_calc_index = 0;
+
+      for (int ph = 0; ph < pooled_height; ph++) {
+        for (int pw = 0; pw < pooled_width; pw++) {
+          int index = index_n_c + ph * pooled_width + pw;
+
+          float output_val = 0.;
+          for (int iy = 0; iy < roi_bin_grid_h; iy++) {
+            for (int ix = 0; ix < roi_bin_grid_w; ix++) {
+              PreCalc<float> pc = pre_calc[pre_calc_index];
+              output_val += pc.w1 * offset_input[pc.pos1].val_ +
+                  pc.w2 * offset_input[pc.pos2].val_ +
+                  pc.w3 * offset_input[pc.pos3].val_ + pc.w4 * offset_input[pc.pos4].val_;
+
+              pre_calc_index += 1;
+            }
+          }
+          output_val /= count;
+
+          output[index] = at::native::requantize_from_int<T>(1.f,  0, (int)output_val);  // TODO: this is wrong need to set scale and zero etc.
+        } // for pw
+      } // for ph
+    } // for c
+  } // for n
+    
+}
+
+at::Tensor qroi_align_forward_kernel(
+    const at::Tensor& input,
+    const at::Tensor& rois,
+    double spatial_scale,
+    int64_t pooled_height,
+    int64_t pooled_width,
+    int64_t sampling_ratio,
+    bool aligned) {
+  TORCH_CHECK(input.device().is_cpu(), "input must be a CPU tensor");
+  TORCH_CHECK(rois.device().is_cpu(), "rois must be a CPU tensor");
+  TORCH_CHECK(rois.size(1) == 5, "rois must have shape as Tensor[K, 5]");
+
+  at::TensorArg input_t{input, "input", 1}, rois_t{rois, "rois", 2};
+
+  at::CheckedFrom c = "qroi_align_forward_kernel";
+  at::checkAllSameType(c, {input_t, rois_t});
+
+  auto num_rois = rois.size(0);
+  auto channels = input.size(1);
+  auto height = input.size(2);
+  auto width = input.size(3);
+
+  // TODO: This should really be initialized to zero, not empty
+  at::Tensor output = at::_empty_affine_quantized(
+      {num_rois, channels, pooled_height, pooled_width}, input.options());
+
+  if (output.numel() == 0)
+    return output;
+
+  auto input_ = input.contiguous(), rois_ = rois.contiguous();
+  AT_DISPATCH_QINT_TYPES(
+      input.scalar_type(), "qroi_align_forward_kernel", [&] {
+        qroi_align_forward_kernel_impl<scalar_t>(
+            num_rois,
+            input_.data_ptr<scalar_t>(),
+            spatial_scale,
+            channels,
+            height,
+            width,
+            pooled_height,
+            pooled_width,
+            sampling_ratio,
+            aligned,
+            rois_.data_ptr<scalar_t>(),
+            output.data_ptr<scalar_t>(),
+            output.numel());
+      });
+  return output;
+}
+
+} // namespace
+
+TORCH_LIBRARY_IMPL(torchvision, QuantizedCPU, m) {
+  m.impl(
+      TORCH_SELECTIVE_NAME("torchvision::roi_align"),
+      TORCH_FN(qroi_align_forward_kernel));
+}
+
+} // namespace ops
+} // namespace vision
\ No newline at end of file

From 8d21449f8c4b56c3c394d1edc41cf7b37b0a4240 Mon Sep 17 00:00:00 2001
From: Nicolas Hug <nicolashug@fb.com>
Date: Thu, 1 Apr 2021 09:40:16 +0100
Subject: [PATCH 02/26] clang

---
 torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp b/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp
index 2926259a70a..e9f0dc9546d 100644
--- a/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp
+++ b/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp
@@ -283,4 +283,4 @@ TORCH_LIBRARY_IMPL(torchvision, QuantizedCPU, m) {
 }
 
 } // namespace ops
-} // namespace vision
\ No newline at end of file
+} // namespace vision

From 68b0dd8f11dcc87d8f034e645a528893db8762ff Mon Sep 17 00:00:00 2001
From: Nicolas Hug <nicolashug@fb.com>
Date: Fri, 2 Apr 2021 22:34:58 +0100
Subject: [PATCH 03/26] docs

---
 torchvision/ops/roi_align.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/torchvision/ops/roi_align.py b/torchvision/ops/roi_align.py
index 0f6c0be1729..c062c6f91f3 100644
--- a/torchvision/ops/roi_align.py
+++ b/torchvision/ops/roi_align.py
@@ -17,7 +17,7 @@ def roi_align(
     aligned: bool = False,
 ) -> Tensor:
     """
-    Performs Region of Interest (RoI) Align operator described in Mask R-CNN
+    Performs Region of Interest (RoI) Align operator with average pooling, as described in Mask R-CNN.
 
     Args:
         input (Tensor[N, C, H, W]): input tensor
@@ -27,19 +27,19 @@ def roi_align(
             If a single Tensor is passed,
             then the first column should contain the batch index. If a list of Tensors
             is passed, then each Tensor will correspond to the boxes for an element i
-            in a batch
-        output_size (int or Tuple[int, int]): the size of the output after the cropping
-            is performed, as (height, width)
+            in a batch.
+        output_size (int or Tuple[int, int]): the size of the output after the pooling
+            is performed, as (height, width).
         spatial_scale (float): a scaling factor that maps the input coordinates to
             the box coordinates. Default: 1.0
         sampling_ratio (int): number of sampling points in the interpolation grid
             used to compute the output value of each pooled output bin. If > 0,
-            then exactly sampling_ratio x sampling_ratio grid points are used. If
+            then exactly ``sampling_ratio x sampling_ratio`` sampling points per bin are used. If
             <= 0, then an adaptive number of grid points are used (computed as
-            ceil(roi_width / pooled_w), and likewise for height). Default: -1
+            ``ceil(roi_width / output_width)``, and likewise for height). Default: -1
         aligned (bool): If False, use the legacy implementation.
             If True, pixel shift it by -0.5 for align more perfectly about two neighboring pixel indices.
-            This version in Detectron2
+            This version is used in Detectron2.
 
     Returns:
         output (Tensor[K, C, output_size[0], output_size[1]])

From c115b7385bc4bec600c149fa09a5015dbf2d0201 Mon Sep 17 00:00:00 2001
From: Nicolas Hug <nicolashug@fb.com>
Date: Fri, 2 Apr 2021 22:35:14 +0100
Subject: [PATCH 04/26] extracted out common utils

---
 torchvision/csrc/ops/cpu/roi_align_kernel.cpp | 123 +--------------
 .../ops/quantized/cpu/qroi_align_kernel.cpp   | 143 ++----------------
 torchvision/csrc/ops/roi_align.h              | 107 +++++++++++++
 3 files changed, 127 insertions(+), 246 deletions(-)

diff --git a/torchvision/csrc/ops/cpu/roi_align_kernel.cpp b/torchvision/csrc/ops/cpu/roi_align_kernel.cpp
index dc0c38cd314..a70e5d5d630 100644
--- a/torchvision/csrc/ops/cpu/roi_align_kernel.cpp
+++ b/torchvision/csrc/ops/cpu/roi_align_kernel.cpp
@@ -1,120 +1,13 @@
 #include <ATen/ATen.h>
 #include <torch/library.h>
 
+#include "../roi_align.h"
+
 namespace vision {
 namespace ops {
 
 namespace {
 
-// implementation taken from Caffe2
-template <typename T>
-struct PreCalc {
-  int pos1;
-  int pos2;
-  int pos3;
-  int pos4;
-  T w1;
-  T w2;
-  T w3;
-  T w4;
-};
-
-template <typename T>
-void pre_calc_for_bilinear_interpolate(
-    int height,
-    int width,
-    int pooled_height,
-    int pooled_width,
-    int iy_upper,
-    int ix_upper,
-    T roi_start_h,
-    T roi_start_w,
-    T bin_size_h,
-    T bin_size_w,
-    int roi_bin_grid_h,
-    int roi_bin_grid_w,
-    std::vector<PreCalc<T>>& pre_calc) {
-  int pre_calc_index = 0;
-  for (int ph = 0; ph < pooled_height; ph++) {
-    for (int pw = 0; pw < pooled_width; pw++) {
-      for (int iy = 0; iy < iy_upper; iy++) {
-        const T yy = roi_start_h + ph * bin_size_h +
-            static_cast<T>(iy + .5f) * bin_size_h /
-                static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
-        for (int ix = 0; ix < ix_upper; ix++) {
-          const T xx = roi_start_w + pw * bin_size_w +
-              static_cast<T>(ix + .5f) * bin_size_w /
-                  static_cast<T>(roi_bin_grid_w);
-
-          T x = xx;
-          T y = yy;
-          // deal with: inverse elements are out of feature map boundary
-          if (y < -1.0 || y > height || x < -1.0 || x > width) {
-            // empty
-            PreCalc<T> pc;
-            pc.pos1 = 0;
-            pc.pos2 = 0;
-            pc.pos3 = 0;
-            pc.pos4 = 0;
-            pc.w1 = 0;
-            pc.w2 = 0;
-            pc.w3 = 0;
-            pc.w4 = 0;
-            pre_calc[pre_calc_index] = pc;
-            pre_calc_index += 1;
-            continue;
-          }
-
-          if (y <= 0) {
-            y = 0;
-          }
-          if (x <= 0) {
-            x = 0;
-          }
-
-          int y_low = (int)y;
-          int x_low = (int)x;
-          int y_high;
-          int x_high;
-
-          if (y_low >= height - 1) {
-            y_high = y_low = height - 1;
-            y = (T)y_low;
-          } else {
-            y_high = y_low + 1;
-          }
-
-          if (x_low >= width - 1) {
-            x_high = x_low = width - 1;
-            x = (T)x_low;
-          } else {
-            x_high = x_low + 1;
-          }
-
-          T ly = y - y_low;
-          T lx = x - x_low;
-          T hy = 1. - ly, hx = 1. - lx;
-          T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
-
-          // save weights and indeces
-          PreCalc<T> pc;
-          pc.pos1 = y_low * width + x_low;
-          pc.pos2 = y_low * width + x_high;
-          pc.pos3 = y_high * width + x_low;
-          pc.pos4 = y_high * width + x_high;
-          pc.w1 = w1;
-          pc.w2 = w2;
-          pc.w3 = w3;
-          pc.w4 = w4;
-          pre_calc[pre_calc_index] = pc;
-
-          pre_calc_index += 1;
-        }
-      }
-    }
-  }
-}
-
 template <typename T>
 void roi_align_forward_kernel_impl(
     int n_rois,
@@ -167,17 +60,15 @@ void roi_align_forward_kernel_impl(
     // When the grid is empty, output zeros.
     const T count = std::max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. = 4
 
-    // we want to precalculate indeces and weights shared by all chanels,
-    // this is the key point of optimiation
-    std::vector<PreCalc<T>> pre_calc(
+    // we want to precalculate indices and weights shared by all channels,
+    // this is the key point of optimization
+    std::vector<detail::PreCalc<T>> pre_calc(
         roi_bin_grid_h * roi_bin_grid_w * pooled_width * pooled_height);
-    pre_calc_for_bilinear_interpolate(
+    detail::pre_calc_for_bilinear_interpolate(
         height,
         width,
         pooled_height,
         pooled_width,
-        roi_bin_grid_h,
-        roi_bin_grid_w,
         roi_start_h,
         roi_start_w,
         bin_size_h,
@@ -199,7 +90,7 @@ void roi_align_forward_kernel_impl(
           T output_val = 0.;
           for (int iy = 0; iy < roi_bin_grid_h; iy++) {
             for (int ix = 0; ix < roi_bin_grid_w; ix++) {
-              PreCalc<T> pc = pre_calc[pre_calc_index];
+              detail::PreCalc<T> pc = pre_calc[pre_calc_index];
               output_val += pc.w1 * offset_input[pc.pos1] +
                   pc.w2 * offset_input[pc.pos2] +
                   pc.w3 * offset_input[pc.pos3] + pc.w4 * offset_input[pc.pos4];
diff --git a/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp b/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp
index e9f0dc9546d..55086ae4ec4 100644
--- a/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp
+++ b/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp
@@ -4,125 +4,18 @@
 #include <ATen/quantized/Quantizer.h>
 #include <ATen/cpu/vec256/vec256.h>
 
+#include "../../roi_align.h"
+
 namespace vision {
 namespace ops {
 
 namespace {
 
-template <typename T>
-struct PreCalc {
-  int pos1;
-  int pos2;
-  int pos3;
-  int pos4;
-  T w1;
-  T w2;
-  T w3;
-  T w4;
-};
-
-template <typename T>
-void pre_calc_for_bilinear_interpolate(
-    int height,
-    int width,
-    int pooled_height,
-    int pooled_width,
-    int iy_upper,
-    int ix_upper,
-    T roi_start_h,
-    T roi_start_w,
-    T bin_size_h,
-    T bin_size_w,
-    int roi_bin_grid_h,
-    int roi_bin_grid_w,
-    std::vector<PreCalc<T>>& pre_calc) {
-  int pre_calc_index = 0;
-  for (int ph = 0; ph < pooled_height; ph++) {
-    for (int pw = 0; pw < pooled_width; pw++) {
-      for (int iy = 0; iy < iy_upper; iy++) {
-        const T yy = roi_start_h + ph * bin_size_h +
-            static_cast<T>(iy + .5f) * bin_size_h /
-                static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
-        for (int ix = 0; ix < ix_upper; ix++) {
-          const T xx = roi_start_w + pw * bin_size_w +
-              static_cast<T>(ix + .5f) * bin_size_w /
-                  static_cast<T>(roi_bin_grid_w);
-
-          T x = xx;
-          T y = yy;
-          // deal with: inverse elements are out of feature map boundary
-          if (y < -1.0 || y > height || x < -1.0 || x > width) {
-            // empty
-            PreCalc<T> pc;
-            pc.pos1 = 0;
-            pc.pos2 = 0;
-            pc.pos3 = 0;
-            pc.pos4 = 0;
-            pc.w1 = 0;
-            pc.w2 = 0;
-            pc.w3 = 0;
-            pc.w4 = 0;
-            pre_calc[pre_calc_index] = pc;
-            pre_calc_index += 1;
-            continue;
-          }
-
-          if (y <= 0) {
-            y = 0;
-          }
-          if (x <= 0) {
-            x = 0;
-          }
-
-          int y_low = (int)y;
-          int x_low = (int)x;
-          int y_high;
-          int x_high;
-
-          if (y_low >= height - 1) {
-            y_high = y_low = height - 1;
-            y = (T)y_low;
-          } else {
-            y_high = y_low + 1;
-          }
-
-          if (x_low >= width - 1) {
-            x_high = x_low = width - 1;
-            x = (T)x_low;
-          } else {
-            x_high = x_low + 1;
-          }
-
-          T ly = y - y_low;
-          T lx = x - x_low;
-          T hy = 1. - ly, hx = 1. - lx;
-          T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
-
-          // save weights and indeces
-          PreCalc<T> pc;
-          pc.pos1 = y_low * width + x_low;
-          pc.pos2 = y_low * width + x_high;
-          pc.pos3 = y_high * width + x_low;
-          pc.pos4 = y_high * width + x_high;
-          pc.w1 = w1;
-          pc.w2 = w2;
-          pc.w3 = w3;
-          pc.w4 = w4;
-          pre_calc[pre_calc_index] = pc;
-
-          pre_calc_index += 1;
-        }
-      }
-    }
-  }
-}
-
-
 template <typename T>
 void qroi_align_forward_kernel_impl(
     int n_rois,
     const T* input,
-    double & spatial_scale,
+    const float & spatial_scale,
     int channels,
     int height,
     int width,
@@ -131,22 +24,16 @@ void qroi_align_forward_kernel_impl(
     int sampling_ratio,
     bool aligned,
     const T* rois,
-    T* output,
-    int64_t output_size) {
-
-  for (int64_t i = 0; i < output_size; i++) {
-      output[i].val_ = 0;
-  }
+    T* output){
 
   for (int n = 0; n < n_rois; n++) {
     int index_n = n * channels * pooled_width * pooled_height;
 
     const T* offset_rois = rois + n * 5;
-    int roi_batch_ind = offset_rois[0].val_;
+    int roi_batch_ind = offset_rois[0].val_; // FIXME: This can be out of the range of the quantized type!!
 
     // Do not using rounding; this implementation detail is critical
-    // T offset = aligned ? (T)0.5 : (T)0.0;
-    int offset = 0;  // TODO fix this
+    float offset = aligned ? 0.5 : 0.;
     float roi_start_w = offset_rois[1].val_ * spatial_scale - offset;
     float roi_start_h = offset_rois[2].val_ * spatial_scale - offset;
     float roi_end_w = offset_rois[3].val_ * spatial_scale - offset;
@@ -172,19 +59,17 @@ void qroi_align_forward_kernel_impl(
 
     // We do average (integral) pooling inside a bin
     // When the grid is empty, output zeros.
-    const int count = std::max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. = 4
+    const float count = std::max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. = 4
 
-    // we want to precalculate indeces and weights shared by all chanels,
-    // this is the key point of optimiation
-    std::vector<PreCalc<float>> pre_calc(
+    // we want to precalculate indices and weights shared by all chanels,
+    // this is the key point of optimization
+    std::vector<detail::PreCalc<float>> pre_calc(
         roi_bin_grid_h * roi_bin_grid_w * pooled_width * pooled_height);
-    pre_calc_for_bilinear_interpolate(
+    detail::pre_calc_for_bilinear_interpolate(
         height,
         width,
         pooled_height,
         pooled_width,
-        roi_bin_grid_h,
-        roi_bin_grid_w,
         roi_start_h,
         roi_start_w,
         bin_size_h,
@@ -206,7 +91,7 @@ void qroi_align_forward_kernel_impl(
           float output_val = 0.;
           for (int iy = 0; iy < roi_bin_grid_h; iy++) {
             for (int ix = 0; ix < roi_bin_grid_w; ix++) {
-              PreCalc<float> pc = pre_calc[pre_calc_index];
+              detail::PreCalc<float> pc = pre_calc[pre_calc_index];
               output_val += pc.w1 * offset_input[pc.pos1].val_ +
                   pc.w2 * offset_input[pc.pos2].val_ +
                   pc.w3 * offset_input[pc.pos3].val_ + pc.w4 * offset_input[pc.pos4].val_;
@@ -246,7 +131,6 @@ at::Tensor qroi_align_forward_kernel(
   auto height = input.size(2);
   auto width = input.size(3);
 
-  // TODO: This should really be initialized to zero, not empty
   at::Tensor output = at::_empty_affine_quantized(
       {num_rois, channels, pooled_height, pooled_width}, input.options());
 
@@ -268,8 +152,7 @@ at::Tensor qroi_align_forward_kernel(
             sampling_ratio,
             aligned,
             rois_.data_ptr<scalar_t>(),
-            output.data_ptr<scalar_t>(),
-            output.numel());
+            output.data_ptr<scalar_t>());
       });
   return output;
 }
diff --git a/torchvision/csrc/ops/roi_align.h b/torchvision/csrc/ops/roi_align.h
index 2ddb6ac3945..ff3b3596120 100644
--- a/torchvision/csrc/ops/roi_align.h
+++ b/torchvision/csrc/ops/roi_align.h
@@ -30,6 +30,113 @@ at::Tensor _roi_align_backward(
     int64_t sampling_ratio,
     bool aligned);
 
+template <typename T>
+struct PreCalc {
+  int pos1;
+  int pos2;
+  int pos3;
+  int pos4;
+  T w1;
+  T w2;
+  T w3;
+  T w4;
+};
+
+// implementation taken from Caffe2
+template <typename T>
+void pre_calc_for_bilinear_interpolate(
+    int height,
+    int width,
+    int pooled_height,
+    int pooled_width,
+    T roi_start_h,
+    T roi_start_w,
+    T bin_size_h,
+    T bin_size_w,
+    int roi_bin_grid_h,
+    int roi_bin_grid_w,
+    std::vector<PreCalc<T>>& pre_calc) {
+  int pre_calc_index = 0;
+  for (int ph = 0; ph < pooled_height; ph++) {
+    for (int pw = 0; pw < pooled_width; pw++) {
+      for (int iy = 0; iy < roi_bin_grid_h; iy++) {
+        const T yy = roi_start_h + ph * bin_size_h +
+            static_cast<T>(iy + .5f) * bin_size_h /
+                static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
+        for (int ix = 0; ix < roi_bin_grid_w; ix++) {
+          const T xx = roi_start_w + pw * bin_size_w +
+              static_cast<T>(ix + .5f) * bin_size_w /
+                  static_cast<T>(roi_bin_grid_w);
+
+          T x = xx;
+          T y = yy;
+          // deal with: inverse elements are out of feature map boundary
+          if (y < -1.0 || y > height || x < -1.0 || x > width) {
+            // empty
+            PreCalc<T> pc;
+            pc.pos1 = 0;
+            pc.pos2 = 0;
+            pc.pos3 = 0;
+            pc.pos4 = 0;
+            pc.w1 = 0;
+            pc.w2 = 0;
+            pc.w3 = 0;
+            pc.w4 = 0;
+            pre_calc[pre_calc_index] = pc;
+            pre_calc_index += 1;
+            continue;
+          }
+
+          if (y <= 0) {
+            y = 0;
+          }
+          if (x <= 0) {
+            x = 0;
+          }
+
+          int y_low = (int)y;
+          int x_low = (int)x;
+          int y_high;
+          int x_high;
+
+          if (y_low >= height - 1) {
+            y_high = y_low = height - 1;
+            y = (T)y_low;
+          } else {
+            y_high = y_low + 1;
+          }
+
+          if (x_low >= width - 1) {
+            x_high = x_low = width - 1;
+            x = (T)x_low;
+          } else {
+            x_high = x_low + 1;
+          }
+
+          T ly = y - y_low;
+          T lx = x - x_low;
+          T hy = 1. - ly, hx = 1. - lx;
+          T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
+
+          // save weights and indeces
+          PreCalc<T> pc;
+          pc.pos1 = y_low * width + x_low;
+          pc.pos2 = y_low * width + x_high;
+          pc.pos3 = y_high * width + x_low;
+          pc.pos4 = y_high * width + x_high;
+          pc.w1 = w1;
+          pc.w2 = w2;
+          pc.w3 = w3;
+          pc.w4 = w4;
+          pre_calc[pre_calc_index] = pc;
+
+          pre_calc_index += 1;
+        }
+      }
+    }
+  }
+}
+
 } // namespace detail
 
 } // namespace ops

From aadd2fc210992fbf1e4f22b8ebf54de96c0ae897 Mon Sep 17 00:00:00 2001
From: Nicolas Hug <nicolashug@fb.com>
Date: Sat, 3 Apr 2021 13:21:32 +0100
Subject: [PATCH 05/26] Use better quantization function and pass tensors as
 parameters

---
 .../ops/quantized/cpu/qroi_align_kernel.cpp   | 23 +++++++++++--------
 torchvision/csrc/ops/roi_align.h              | 10 ++++++++
 2 files changed, 23 insertions(+), 10 deletions(-)

diff --git a/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp b/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp
index 55086ae4ec4..002ba408729 100644
--- a/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp
+++ b/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp
@@ -1,8 +1,6 @@
 #include <ATen/ATen.h>
 #include <torch/library.h>
 #include <ATen/native/quantized/affine_quantizer.h>
-#include <ATen/quantized/Quantizer.h>
-#include <ATen/cpu/vec256/vec256.h>
 
 #include "../../roi_align.h"
 
@@ -14,7 +12,7 @@ namespace {
 template <typename T>
 void qroi_align_forward_kernel_impl(
     int n_rois,
-    const T* input,
+    const at::Tensor& t_input,
     const float & spatial_scale,
     int channels,
     int height,
@@ -23,9 +21,12 @@ void qroi_align_forward_kernel_impl(
     int pooled_width,
     int sampling_ratio,
     bool aligned,
-    const T* rois,
+    const at::Tensor& t_rois,
     T* output){
-
+  
+  const T* input = t_input.contiguous().data_ptr<T>();
+  const T* rois = t_rois.contiguous().data_ptr<T>();
+  
   for (int n = 0; n < n_rois; n++) {
     int index_n = n * channels * pooled_width * pooled_height;
 
@@ -101,7 +102,7 @@ void qroi_align_forward_kernel_impl(
           }
           output_val /= count;
 
-          output[index] = at::native::requantize_from_int<T>(1.f,  0, (int)output_val);  // TODO: this is wrong need to set scale and zero etc.
+          output[index] = at::native::quantize_val<T>(1.f,  0, output_val);  // TODO: this is wrong need to set scale and zero etc.
         } // for pw
       } // for ph
     } // for c
@@ -131,18 +132,20 @@ at::Tensor qroi_align_forward_kernel(
   auto height = input.size(2);
   auto width = input.size(3);
 
+  // FIXME: This is private, API might change:
+  // https://github.com/pytorch/pytorch/wiki/Introducing-Quantized-Tensor#quantized-tensor-apis
   at::Tensor output = at::_empty_affine_quantized(
-      {num_rois, channels, pooled_height, pooled_width}, input.options());
+      {num_rois, channels, pooled_height, pooled_width}, input.options(),
+       input.q_scale(), input.q_zero_point());
 
   if (output.numel() == 0)
     return output;
 
-  auto input_ = input.contiguous(), rois_ = rois.contiguous();
   AT_DISPATCH_QINT_TYPES(
       input.scalar_type(), "qroi_align_forward_kernel", [&] {
         qroi_align_forward_kernel_impl<scalar_t>(
             num_rois,
-            input_.data_ptr<scalar_t>(),
+            input,
             spatial_scale,
             channels,
             height,
@@ -151,7 +154,7 @@ at::Tensor qroi_align_forward_kernel(
             pooled_width,
             sampling_ratio,
             aligned,
-            rois_.data_ptr<scalar_t>(),
+            rois,
             output.data_ptr<scalar_t>());
       });
   return output;
diff --git a/torchvision/csrc/ops/roi_align.h b/torchvision/csrc/ops/roi_align.h
index ff3b3596120..20c5684c87c 100644
--- a/torchvision/csrc/ops/roi_align.h
+++ b/torchvision/csrc/ops/roi_align.h
@@ -42,6 +42,16 @@ struct PreCalc {
   T w4;
 };
 
+// This helper computes the interpolation weights (w1, w2...) for every sampling
+// point of a given box. There are pool_height * pool_width * roi_bin_grid_h *
+// roi_bin_grid_w such sampling points.
+// 
+// The weights (w1, w2...) are computed as the areas in this figure:
+// https://en.wikipedia.org/wiki/Bilinear_interpolation#/media/File:Bilinear_interpolation_visualisation.svg
+// and pos1, pos2 etc correspond to the indices of their respective pixels.
+//
+// Note: the weights and indices are shared across all channels, which is why
+// they are pre-calculated prior to the main loop in the RoIAlign kernel.
 // implementation taken from Caffe2
 template <typename T>
 void pre_calc_for_bilinear_interpolate(

From 81a320778b9606386a71e9f7d1179b43ce9a0f1b Mon Sep 17 00:00:00 2001
From: Nicolas Hug <nicolashug@fb.com>
Date: Sat, 3 Apr 2021 14:11:48 +0100
Subject: [PATCH 06/26] proper dequantization

---
 .../ops/quantized/cpu/qroi_align_kernel.cpp   | 27 ++++++++++++-------
 1 file changed, 17 insertions(+), 10 deletions(-)

diff --git a/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp b/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp
index 002ba408729..a2e71129f9b 100644
--- a/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp
+++ b/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp
@@ -25,20 +25,25 @@ void qroi_align_forward_kernel_impl(
     T* output){
   
   const T* input = t_input.contiguous().data_ptr<T>();
+  int64_t input_zp = t_input.q_zero_point();
+  float input_scale = t_input.q_scale();
+
   const T* rois = t_rois.contiguous().data_ptr<T>();
+  int64_t rois_zp = t_rois.q_zero_point();
+  float rois_scale = t_rois.q_scale();
   
   for (int n = 0; n < n_rois; n++) {
     int index_n = n * channels * pooled_width * pooled_height;
 
     const T* offset_rois = rois + n * 5;
-    int roi_batch_ind = offset_rois[0].val_; // FIXME: This can be out of the range of the quantized type!!
+    int roi_batch_ind = at::native::dequantize_val(rois_scale, rois_zp, offset_rois[0]); // FIXME: This can be out of the range of the quantized type!!
 
     // Do not using rounding; this implementation detail is critical
     float offset = aligned ? 0.5 : 0.;
-    float roi_start_w = offset_rois[1].val_ * spatial_scale - offset;
-    float roi_start_h = offset_rois[2].val_ * spatial_scale - offset;
-    float roi_end_w = offset_rois[3].val_ * spatial_scale - offset;
-    float roi_end_h = offset_rois[4].val_ * spatial_scale - offset;
+    float roi_start_w = at::native::dequantize_val(rois_scale, rois_zp, offset_rois[1]) * spatial_scale - offset;
+    float roi_start_h = at::native::dequantize_val(rois_scale, rois_zp, offset_rois[2]) * spatial_scale - offset;
+    float roi_end_w = at::native::dequantize_val(rois_scale, rois_zp, offset_rois[3]) * spatial_scale - offset;
+    float roi_end_h = at::native::dequantize_val(rois_scale, rois_zp, offset_rois[4]) * spatial_scale - offset;
 
     float roi_width = roi_end_w - roi_start_w;
     float roi_height = roi_end_h - roi_start_h;
@@ -93,21 +98,23 @@ void qroi_align_forward_kernel_impl(
           for (int iy = 0; iy < roi_bin_grid_h; iy++) {
             for (int ix = 0; ix < roi_bin_grid_w; ix++) {
               detail::PreCalc<float> pc = pre_calc[pre_calc_index];
-              output_val += pc.w1 * offset_input[pc.pos1].val_ +
-                  pc.w2 * offset_input[pc.pos2].val_ +
-                  pc.w3 * offset_input[pc.pos3].val_ + pc.w4 * offset_input[pc.pos4].val_;
+
+              output_val +=  // TODO: We can probably optimize the dequantization
+                  pc.w1 * at::native::dequantize_val(input_scale, input_zp, offset_input[pc.pos1]) +
+                  pc.w2 * at::native::dequantize_val(input_scale, input_zp, offset_input[pc.pos2]) +
+                  pc.w3 * at::native::dequantize_val(input_scale, input_zp, offset_input[pc.pos3]) +
+                  pc.w4 * at::native::dequantize_val(input_scale, input_zp, offset_input[pc.pos4]);
 
               pre_calc_index += 1;
             }
           }
           output_val /= count;
 
-          output[index] = at::native::quantize_val<T>(1.f,  0, output_val);  // TODO: this is wrong need to set scale and zero etc.
+          output[index] = at::native::quantize_val<T>(input_scale, input_zp, output_val);
         } // for pw
       } // for ph
     } // for c
   } // for n
-    
 }
 
 at::Tensor qroi_align_forward_kernel(

From 295a6ccf9c87c2062cfa35c9f7e08ef9c8e9dfc6 Mon Sep 17 00:00:00 2001
From: Nicolas Hug <nicolashug@fb.com>
Date: Sat, 3 Apr 2021 17:19:38 +0100
Subject: [PATCH 07/26] Some tests

---
 test/test_ops.py | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/test/test_ops.py b/test/test_ops.py
index 0031da45cce..fab324b2f3d 100644
--- a/test/test_ops.py
+++ b/test/test_ops.py
@@ -299,6 +299,36 @@ def _test_forward(self, device, contiguous, x_dtype=None, rois_dtype=None, **kwa
         for aligned in (True, False):
             super()._test_forward(device, contiguous, x_dtype, rois_dtype, aligned=aligned)
 
+    def test_qroialign(self):
+        """Make sure quantized version of RoIAlign is close to float version"""
+        pool_size = 5
+        img_size = 10
+        n_channels = 2
+        num_batches = 2
+        dtype = torch.float
+
+        def make_rois(num_rois=1000):
+            rois = torch.randint(0, img_size // 2, size=(num_rois, 5)).to(dtype)
+            rois[:, 0] = torch.randint(0, num_batches, size=(num_rois,))  # set batch index
+            rois[:, 3:] += rois[:, 1:3]  # make sure boxes aren't degenerate
+            return rois
+
+        for scale, zero_point in ((1, 0), (2, 10)):
+            for qdtype in (torch.qint8, torch.quint8, torch.qint32):
+
+                x = torch.randint(0, 100, size=(num_batches, n_channels, img_size, img_size)).to(dtype)
+                qx = torch.quantize_per_tensor(x, scale=scale, zero_point=zero_point, dtype=qdtype)
+
+                rois = make_rois()
+                qrois = torch.quantize_per_tensor(rois, scale=scale, zero_point=zero_point, dtype=qdtype)
+
+                x, rois = qx.dequantize(), qrois.dequantize()
+
+                y = ops.roi_align(x, rois, output_size=pool_size, spatial_scale=1, sampling_ratio=-1)
+                qy = ops.roi_align(qx, qrois, output_size=pool_size, spatial_scale=1, sampling_ratio=-1)
+
+                self.assertTrue(torch.allclose(y, qy.dequantize(), atol=1))
+
 
 class PSRoIAlignTester(RoIOpTester, unittest.TestCase):
     def fn(self, x, rois, pool_h, pool_w, spatial_scale=1, sampling_ratio=-1, **kwargs):

From 626f7900435c21559558562d332754533c2330ef Mon Sep 17 00:00:00 2001
From: Nicolas Hug <nicolashug@fb.com>
Date: Sat, 3 Apr 2021 17:39:36 +0100
Subject: [PATCH 08/26] Dequantization optimization, seems to gain a few ms

---
 torchvision/csrc/ops/cpu/roi_align_kernel.cpp   |  2 +-
 .../ops/quantized/cpu/qroi_align_kernel.cpp     | 17 +++++++++++------
 2 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/torchvision/csrc/ops/cpu/roi_align_kernel.cpp b/torchvision/csrc/ops/cpu/roi_align_kernel.cpp
index a70e5d5d630..619064cb40e 100644
--- a/torchvision/csrc/ops/cpu/roi_align_kernel.cpp
+++ b/torchvision/csrc/ops/cpu/roi_align_kernel.cpp
@@ -98,7 +98,7 @@ void roi_align_forward_kernel_impl(
               pre_calc_index += 1;
             }
           }
-          output_val /= count;
+          output_val /= count; // Average pooling
 
           output[index] = output_val;
         } // for pw
diff --git a/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp b/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp
index a2e71129f9b..0c6b8e7f925 100644
--- a/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp
+++ b/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp
@@ -95,20 +95,25 @@ void qroi_align_forward_kernel_impl(
           int index = index_n_c + ph * pooled_width + pw;
 
           float output_val = 0.;
+          float sum_w = 0.;
           for (int iy = 0; iy < roi_bin_grid_h; iy++) {
             for (int ix = 0; ix < roi_bin_grid_w; ix++) {
               detail::PreCalc<float> pc = pre_calc[pre_calc_index];
 
-              output_val +=  // TODO: We can probably optimize the dequantization
-                  pc.w1 * at::native::dequantize_val(input_scale, input_zp, offset_input[pc.pos1]) +
-                  pc.w2 * at::native::dequantize_val(input_scale, input_zp, offset_input[pc.pos2]) +
-                  pc.w3 * at::native::dequantize_val(input_scale, input_zp, offset_input[pc.pos3]) +
-                  pc.w4 * at::native::dequantize_val(input_scale, input_zp, offset_input[pc.pos4]);
+              // to optimize computations we use the raw .val_ fields and we'll dequantize later
+              output_val +=
+                  pc.w1 * offset_input[pc.pos1].val_ +
+                  pc.w2 * offset_input[pc.pos2].val_ +
+                  pc.w3 * offset_input[pc.pos3].val_ +
+                  pc.w4 * offset_input[pc.pos4].val_;
+
+              sum_w += pc.w1 + pc.w2 + pc.w3 + pc.w4;
 
               pre_calc_index += 1;
             }
           }
-          output_val /= count;
+          output_val = input_scale * (output_val - input_zp * sum_w);  // dequantization
+          output_val /= count; // Average pooling
 
           output[index] = at::native::quantize_val<T>(input_scale, input_zp, output_val);
         } // for pw

From b1b68f13983e589aa072dbddfda863ce06fabac7 Mon Sep 17 00:00:00 2001
From: Nicolas Hug <nicolashug@fb.com>
Date: Sat, 3 Apr 2021 17:44:50 +0100
Subject: [PATCH 09/26] clang-format

---
 .../ops/quantized/cpu/qroi_align_kernel.cpp   | 85 +++++++++++--------
 1 file changed, 51 insertions(+), 34 deletions(-)

diff --git a/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp b/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp
index 0c6b8e7f925..81c14f625d2 100644
--- a/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp
+++ b/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp
@@ -1,6 +1,6 @@
 #include <ATen/ATen.h>
-#include <torch/library.h>
 #include <ATen/native/quantized/affine_quantizer.h>
+#include <torch/library.h>
 
 #include "../../roi_align.h"
 
@@ -13,7 +13,7 @@ template <typename T>
 void qroi_align_forward_kernel_impl(
     int n_rois,
     const at::Tensor& t_input,
-    const float & spatial_scale,
+    const float& spatial_scale,
     int channels,
     int height,
     int width,
@@ -22,8 +22,7 @@ void qroi_align_forward_kernel_impl(
     int sampling_ratio,
     bool aligned,
     const at::Tensor& t_rois,
-    T* output){
-  
+    T* output) {
   const T* input = t_input.contiguous().data_ptr<T>();
   int64_t input_zp = t_input.q_zero_point();
   float input_scale = t_input.q_scale();
@@ -31,19 +30,33 @@ void qroi_align_forward_kernel_impl(
   const T* rois = t_rois.contiguous().data_ptr<T>();
   int64_t rois_zp = t_rois.q_zero_point();
   float rois_scale = t_rois.q_scale();
-  
+
   for (int n = 0; n < n_rois; n++) {
     int index_n = n * channels * pooled_width * pooled_height;
 
     const T* offset_rois = rois + n * 5;
-    int roi_batch_ind = at::native::dequantize_val(rois_scale, rois_zp, offset_rois[0]); // FIXME: This can be out of the range of the quantized type!!
+    int roi_batch_ind = at::native::dequantize_val(
+        rois_scale, rois_zp, offset_rois[0]); // FIXME: This can be out of the
+                                              // range of the quantized type!!
 
     // Do not using rounding; this implementation detail is critical
     float offset = aligned ? 0.5 : 0.;
-    float roi_start_w = at::native::dequantize_val(rois_scale, rois_zp, offset_rois[1]) * spatial_scale - offset;
-    float roi_start_h = at::native::dequantize_val(rois_scale, rois_zp, offset_rois[2]) * spatial_scale - offset;
-    float roi_end_w = at::native::dequantize_val(rois_scale, rois_zp, offset_rois[3]) * spatial_scale - offset;
-    float roi_end_h = at::native::dequantize_val(rois_scale, rois_zp, offset_rois[4]) * spatial_scale - offset;
+    float roi_start_w =
+        at::native::dequantize_val(rois_scale, rois_zp, offset_rois[1]) *
+            spatial_scale -
+        offset;
+    float roi_start_h =
+        at::native::dequantize_val(rois_scale, rois_zp, offset_rois[2]) *
+            spatial_scale -
+        offset;
+    float roi_end_w =
+        at::native::dequantize_val(rois_scale, rois_zp, offset_rois[3]) *
+            spatial_scale -
+        offset;
+    float roi_end_h =
+        at::native::dequantize_val(rois_scale, rois_zp, offset_rois[4]) *
+            spatial_scale -
+        offset;
 
     float roi_width = roi_end_w - roi_start_w;
     float roi_height = roi_end_h - roi_start_h;
@@ -65,7 +78,8 @@ void qroi_align_forward_kernel_impl(
 
     // We do average (integral) pooling inside a bin
     // When the grid is empty, output zeros.
-    const float count = std::max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. = 4
+    const float count =
+        std::max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. = 4
 
     // we want to precalculate indices and weights shared by all chanels,
     // this is the key point of optimization
@@ -100,9 +114,9 @@ void qroi_align_forward_kernel_impl(
             for (int ix = 0; ix < roi_bin_grid_w; ix++) {
               detail::PreCalc<float> pc = pre_calc[pre_calc_index];
 
-              // to optimize computations we use the raw .val_ fields and we'll dequantize later
-              output_val +=
-                  pc.w1 * offset_input[pc.pos1].val_ +
+              // to optimize computations we use the raw .val_ fields and we'll
+              // dequantize later
+              output_val += pc.w1 * offset_input[pc.pos1].val_ +
                   pc.w2 * offset_input[pc.pos2].val_ +
                   pc.w3 * offset_input[pc.pos3].val_ +
                   pc.w4 * offset_input[pc.pos4].val_;
@@ -112,10 +126,12 @@ void qroi_align_forward_kernel_impl(
               pre_calc_index += 1;
             }
           }
-          output_val = input_scale * (output_val - input_zp * sum_w);  // dequantization
+          output_val =
+              input_scale * (output_val - input_zp * sum_w); // dequantization
           output_val /= count; // Average pooling
 
-          output[index] = at::native::quantize_val<T>(input_scale, input_zp, output_val);
+          output[index] =
+              at::native::quantize_val<T>(input_scale, input_zp, output_val);
         } // for pw
       } // for ph
     } // for c
@@ -147,28 +163,29 @@ at::Tensor qroi_align_forward_kernel(
   // FIXME: This is private, API might change:
   // https://github.com/pytorch/pytorch/wiki/Introducing-Quantized-Tensor#quantized-tensor-apis
   at::Tensor output = at::_empty_affine_quantized(
-      {num_rois, channels, pooled_height, pooled_width}, input.options(),
-       input.q_scale(), input.q_zero_point());
+      {num_rois, channels, pooled_height, pooled_width},
+      input.options(),
+      input.q_scale(),
+      input.q_zero_point());
 
   if (output.numel() == 0)
     return output;
 
-  AT_DISPATCH_QINT_TYPES(
-      input.scalar_type(), "qroi_align_forward_kernel", [&] {
-        qroi_align_forward_kernel_impl<scalar_t>(
-            num_rois,
-            input,
-            spatial_scale,
-            channels,
-            height,
-            width,
-            pooled_height,
-            pooled_width,
-            sampling_ratio,
-            aligned,
-            rois,
-            output.data_ptr<scalar_t>());
-      });
+  AT_DISPATCH_QINT_TYPES(input.scalar_type(), "qroi_align_forward_kernel", [&] {
+    qroi_align_forward_kernel_impl<scalar_t>(
+        num_rois,
+        input,
+        spatial_scale,
+        channels,
+        height,
+        width,
+        pooled_height,
+        pooled_width,
+        sampling_ratio,
+        aligned,
+        rois,
+        output.data_ptr<scalar_t>());
+  });
   return output;
 }
 

From fb45472c9e0de9e97991b406a248257ca9d38cbd Mon Sep 17 00:00:00 2001
From: Nicolas Hug <nicolashug@fb.com>
Date: Sat, 3 Apr 2021 17:49:58 +0100
Subject: [PATCH 10/26] again

---
 torchvision/csrc/ops/roi_align.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torchvision/csrc/ops/roi_align.h b/torchvision/csrc/ops/roi_align.h
index 20c5684c87c..61775e4559d 100644
--- a/torchvision/csrc/ops/roi_align.h
+++ b/torchvision/csrc/ops/roi_align.h
@@ -45,7 +45,7 @@ struct PreCalc {
 // This helper computes the interpolation weights (w1, w2...) for every sampling
 // point of a given box. There are pool_height * pool_width * roi_bin_grid_h *
 // roi_bin_grid_w such sampling points.
-// 
+//
 // The weights (w1, w2...) are computed as the areas in this figure:
 // https://en.wikipedia.org/wiki/Bilinear_interpolation#/media/File:Bilinear_interpolation_visualisation.svg
 // and pos1, pos2 etc correspond to the indices of their respective pixels.

From 79bdfdf010a48067a89a49fa0c5521b1fbd2aadb Mon Sep 17 00:00:00 2001
From: Nicolas Hug <nicolashug@fb.com>
Date: Sun, 4 Apr 2021 10:25:56 +0100
Subject: [PATCH 11/26] more correct test. Had to remove optimization although
 it almost works

---
 test/test_ops.py                              | 40 +++++++++++++++----
 .../ops/quantized/cpu/qroi_align_kernel.cpp   | 36 ++++++++++++-----
 2 files changed, 57 insertions(+), 19 deletions(-)

diff --git a/test/test_ops.py b/test/test_ops.py
index fab324b2f3d..8d3c322083e 100644
--- a/test/test_ops.py
+++ b/test/test_ops.py
@@ -313,21 +313,45 @@ def make_rois(num_rois=1000):
             rois[:, 3:] += rois[:, 1:3]  # make sure boxes aren't degenerate
             return rois
 
-        for scale, zero_point in ((1, 0), (2, 10)):
+        for scale, zero_point in ((1, 0), (2, 10), (0.1, 50)):
             for qdtype in (torch.qint8, torch.quint8, torch.qint32):
 
-                x = torch.randint(0, 100, size=(num_batches, n_channels, img_size, img_size)).to(dtype)
+                x = torch.randint(50, 100, size=(num_batches, n_channels, img_size, img_size)).to(dtype)
                 qx = torch.quantize_per_tensor(x, scale=scale, zero_point=zero_point, dtype=qdtype)
 
                 rois = make_rois()
                 qrois = torch.quantize_per_tensor(rois, scale=scale, zero_point=zero_point, dtype=qdtype)
 
-                x, rois = qx.dequantize(), qrois.dequantize()
-
-                y = ops.roi_align(x, rois, output_size=pool_size, spatial_scale=1, sampling_ratio=-1)
-                qy = ops.roi_align(qx, qrois, output_size=pool_size, spatial_scale=1, sampling_ratio=-1)
-
-                self.assertTrue(torch.allclose(y, qy.dequantize(), atol=1))
+                x, rois = qx.dequantize(), qrois.dequantize()  # we want to pass the same inputs
+
+                y = ops.roi_align(
+                    x,
+                    rois,
+                    output_size=pool_size,
+                    spatial_scale=1,
+                    sampling_ratio=-1,
+                    # aligned=aligned,
+                )
+                qy = ops.roi_align(
+                    qx,
+                    qrois,
+                    output_size=pool_size,
+                    spatial_scale=1,
+                    sampling_ratio=-1,
+                    # aligned=aligned,
+                )
+
+                # The output qy is itself a quantized tensor and there might have been a loss of info when it was
+                # quantized. For a fair comparison we need to quantize y as well
+                quantized_float_y = torch.quantize_per_tensor(y, scale=scale, zero_point=zero_point, dtype=qdtype)
+                n_diff = (quantized_float_y != qy).sum()
+                diff = torch.abs((quantized_float_y.dequantize() - qy.dequantize())).sum()
+                self.assertTrue((qy == quantized_float_y).all(), f"{scale}, {zero_point}, {qdtype}, {n_diff}, {diff},")
+
+                if (scale, zero_point) == (1, 0):
+                    # in this case we can assert strict equality as the requantization of the output was the
+                    # identity
+                    self.assertTrue((qy.dequantize() == y.round()).all())
 
 
 class PSRoIAlignTester(RoIOpTester, unittest.TestCase):
diff --git a/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp b/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp
index 81c14f625d2..10c62cf78b3 100644
--- a/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp
+++ b/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp
@@ -109,25 +109,39 @@ void qroi_align_forward_kernel_impl(
           int index = index_n_c + ph * pooled_width + pw;
 
           float output_val = 0.;
-          float sum_w = 0.;
+          // float sum_w = 0.;
           for (int iy = 0; iy < roi_bin_grid_h; iy++) {
             for (int ix = 0; ix < roi_bin_grid_w; ix++) {
               detail::PreCalc<float> pc = pre_calc[pre_calc_index];
 
-              // to optimize computations we use the raw .val_ fields and we'll
-              // dequantize later
-              output_val += pc.w1 * offset_input[pc.pos1].val_ +
-                  pc.w2 * offset_input[pc.pos2].val_ +
-                  pc.w3 * offset_input[pc.pos3].val_ +
-                  pc.w4 * offset_input[pc.pos4].val_;
-
-              sum_w += pc.w1 + pc.w2 + pc.w3 + pc.w4;
+              output_val += pc.w1 *
+                      at::native::dequantize_val(
+                                input_scale, input_zp, offset_input[pc.pos1]) +
+                  pc.w2 *
+                      at::native::dequantize_val(
+                          input_scale, input_zp, offset_input[pc.pos2]) +
+                  pc.w3 *
+                      at::native::dequantize_val(
+                          input_scale, input_zp, offset_input[pc.pos3]) +
+                  pc.w4 *
+                      at::native::dequantize_val(
+                          input_scale, input_zp, offset_input[pc.pos4]);
+
+              // FIXME: Possible optimization. Unfortunately the tests fail
+              // on some (few) inputs: Python rounds up while the C++ code
+              // rounds down (or the other way around).
+              // output_val += pc.w1 * offset_input[pc.pos1].val_ +
+              //     pc.w2 * offset_input[pc.pos2].val_ +
+              //     pc.w3 * offset_input[pc.pos3].val_ +
+              //     pc.w4 * offset_input[pc.pos4].val_;
+              // sum_w += pc.w1 + pc.w2 + pc.w3 + pc.w4;
+              // And then dequantize later, just before averaging:
+              // output_val = input_scale * (output_val - (float)input_zp *
+              // sum_w);
 
               pre_calc_index += 1;
             }
           }
-          output_val =
-              input_scale * (output_val - input_zp * sum_w); // dequantization
           output_val /= count; // Average pooling
 
           output[index] =

From 3dccacaaa85f07d39d4c267439ae1f56f11e4be7 Mon Sep 17 00:00:00 2001
From: Nicolas Hug <nicolashug@fb.com>
Date: Sun, 4 Apr 2021 11:31:06 +0100
Subject: [PATCH 12/26] Also test aligned=True

---
 test/test_ops.py | 82 +++++++++++++++++++++++++-----------------------
 1 file changed, 43 insertions(+), 39 deletions(-)

diff --git a/test/test_ops.py b/test/test_ops.py
index 8d3c322083e..6b3b085a81e 100644
--- a/test/test_ops.py
+++ b/test/test_ops.py
@@ -313,45 +313,49 @@ def make_rois(num_rois=1000):
             rois[:, 3:] += rois[:, 1:3]  # make sure boxes aren't degenerate
             return rois
 
-        for scale, zero_point in ((1, 0), (2, 10), (0.1, 50)):
-            for qdtype in (torch.qint8, torch.quint8, torch.qint32):
-
-                x = torch.randint(50, 100, size=(num_batches, n_channels, img_size, img_size)).to(dtype)
-                qx = torch.quantize_per_tensor(x, scale=scale, zero_point=zero_point, dtype=qdtype)
-
-                rois = make_rois()
-                qrois = torch.quantize_per_tensor(rois, scale=scale, zero_point=zero_point, dtype=qdtype)
-
-                x, rois = qx.dequantize(), qrois.dequantize()  # we want to pass the same inputs
-
-                y = ops.roi_align(
-                    x,
-                    rois,
-                    output_size=pool_size,
-                    spatial_scale=1,
-                    sampling_ratio=-1,
-                    # aligned=aligned,
-                )
-                qy = ops.roi_align(
-                    qx,
-                    qrois,
-                    output_size=pool_size,
-                    spatial_scale=1,
-                    sampling_ratio=-1,
-                    # aligned=aligned,
-                )
-
-                # The output qy is itself a quantized tensor and there might have been a loss of info when it was
-                # quantized. For a fair comparison we need to quantize y as well
-                quantized_float_y = torch.quantize_per_tensor(y, scale=scale, zero_point=zero_point, dtype=qdtype)
-                n_diff = (quantized_float_y != qy).sum()
-                diff = torch.abs((quantized_float_y.dequantize() - qy.dequantize())).sum()
-                self.assertTrue((qy == quantized_float_y).all(), f"{scale}, {zero_point}, {qdtype}, {n_diff}, {diff},")
-
-                if (scale, zero_point) == (1, 0):
-                    # in this case we can assert strict equality as the requantization of the output was the
-                    # identity
-                    self.assertTrue((qy.dequantize() == y.round()).all())
+        for aligned in (True, False):
+            for scale, zero_point in ((1, 0), (2, 10), (0.1, 50)):
+                for qdtype in (torch.qint8, torch.quint8, torch.qint32):
+
+                    x = torch.randint(50, 100, size=(num_batches, n_channels, img_size, img_size)).to(dtype)
+                    qx = torch.quantize_per_tensor(x, scale=scale, zero_point=zero_point, dtype=qdtype)
+
+                    rois = make_rois()
+                    qrois = torch.quantize_per_tensor(rois, scale=scale, zero_point=zero_point, dtype=qdtype)
+
+                    x, rois = qx.dequantize(), qrois.dequantize()  # we want to pass the same inputs
+
+                    y = ops.roi_align(
+                        x,
+                        rois,
+                        output_size=pool_size,
+                        spatial_scale=1,
+                        sampling_ratio=-1,
+                        aligned=aligned,
+                    )
+                    qy = ops.roi_align(
+                        qx,
+                        qrois,
+                        output_size=pool_size,
+                        spatial_scale=1,
+                        sampling_ratio=-1,
+                        aligned=aligned,
+                    )
+
+                    # The output qy is itself a quantized tensor and there might have been a loss of info when it was
+                    # quantized. For a fair comparison we need to quantize y as well
+                    quantized_float_y = torch.quantize_per_tensor(y, scale=scale, zero_point=zero_point, dtype=qdtype)
+                    n_diff = (quantized_float_y != qy).sum()
+                    diff = torch.abs((quantized_float_y.dequantize() - qy.dequantize())).sum()
+                    self.assertTrue(
+                        (qy == quantized_float_y).all(),
+                        f"{scale}, {zero_point}, {qdtype}, {n_diff}, {diff},",
+                    )
+
+                    if (scale, zero_point) == (1, 0):
+                        # in this case we can assert strict equality as the requantization of the output was the
+                        # identity
+                        self.assertTrue((qy.dequantize() == y.round()).all())
 
 
 class PSRoIAlignTester(RoIOpTester, unittest.TestCase):

From c0b13fd4cb668e79d12054e2ded9ad9d3c73ce68 Mon Sep 17 00:00:00 2001
From: Nicolas Hug <nicolashug@fb.com>
Date: Sun, 4 Apr 2021 11:40:21 +0100
Subject: [PATCH 13/26] remove useless part

---
 test/test_ops.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/test/test_ops.py b/test/test_ops.py
index 6b3b085a81e..8d7c1c972a6 100644
--- a/test/test_ops.py
+++ b/test/test_ops.py
@@ -352,11 +352,6 @@ def make_rois(num_rois=1000):
                         f"{scale}, {zero_point}, {qdtype}, {n_diff}, {diff},",
                     )
 
-                    if (scale, zero_point) == (1, 0):
-                        # in this case we can assert strict equality as the requantization of the output was the
-                        # identity
-                        self.assertTrue((qy.dequantize() == y.round()).all())
-
 
 class PSRoIAlignTester(RoIOpTester, unittest.TestCase):
     def fn(self, x, rois, pool_h, pool_w, spatial_scale=1, sampling_ratio=-1, **kwargs):

From 8527755c0aa307632aa66726c76c373362d8c298 Mon Sep 17 00:00:00 2001
From: Nicolas Hug <nicolashug@fb.com>
Date: Sun, 4 Apr 2021 11:44:57 +0100
Subject: [PATCH 14/26] more docs and comments

---
 torchvision/csrc/ops/roi_align.h | 2 +-
 torchvision/ops/roi_align.py     | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/torchvision/csrc/ops/roi_align.h b/torchvision/csrc/ops/roi_align.h
index 61775e4559d..db14e896fe6 100644
--- a/torchvision/csrc/ops/roi_align.h
+++ b/torchvision/csrc/ops/roi_align.h
@@ -128,7 +128,7 @@ void pre_calc_for_bilinear_interpolate(
           T hy = 1. - ly, hx = 1. - lx;
           T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
 
-          // save weights and indeces
+          // save weights and indices
           PreCalc<T> pc;
           pc.pos1 = y_low * width + x_low;
           pc.pos2 = y_low * width + x_high;
diff --git a/torchvision/ops/roi_align.py b/torchvision/ops/roi_align.py
index c062c6f91f3..0915aa07fa6 100644
--- a/torchvision/ops/roi_align.py
+++ b/torchvision/ops/roi_align.py
@@ -28,7 +28,7 @@ def roi_align(
             then the first column should contain the batch index. If a list of Tensors
             is passed, then each Tensor will correspond to the boxes for an element i
             in a batch.
-        output_size (int or Tuple[int, int]): the size of the output after the pooling
+        output_size (int or Tuple[int, int]): the size of the output (in bins or pixels) after the pooling
             is performed, as (height, width).
         spatial_scale (float): a scaling factor that maps the input coordinates to
             the box coordinates. Default: 1.0
@@ -38,8 +38,8 @@ def roi_align(
             <= 0, then an adaptive number of grid points are used (computed as
             ``ceil(roi_width / output_width)``, and likewise for height). Default: -1
         aligned (bool): If False, use the legacy implementation.
-            If True, pixel shift it by -0.5 for align more perfectly about two neighboring pixel indices.
-            This version is used in Detectron2
+            If True, pixel shift the box coordinates it by -0.5 for a better alignment with the two
+            neighboring pixel indices. This version is used in Detectron2
 
     Returns:
         output (Tensor[K, C, output_size[0], output_size[1]])

From efef48af7868a9cf88c5aeb381ee32cc7ba8002e Mon Sep 17 00:00:00 2001
From: Nicolas Hug <nicolashug@fb.com>
Date: Mon, 5 Apr 2021 10:53:44 +0100
Subject: [PATCH 15/26] Put back optimization with more robust test

---
 test/test_ops.py                              | 24 +++++++++----
 .../ops/quantized/cpu/qroi_align_kernel.cpp   | 36 ++++++-------------
 2 files changed, 29 insertions(+), 31 deletions(-)

diff --git a/test/test_ops.py b/test/test_ops.py
index 8d7c1c972a6..4a73fe8ee74 100644
--- a/test/test_ops.py
+++ b/test/test_ops.py
@@ -345,12 +345,24 @@ def make_rois(num_rois=1000):
                     # The output qy is itself a quantized tensor and there might have been a loss of info when it was
                     # quantized. For a fair comparison we need to quantize y as well
                     quantized_float_y = torch.quantize_per_tensor(y, scale=scale, zero_point=zero_point, dtype=qdtype)
-                    n_diff = (quantized_float_y != qy).sum()
-                    diff = torch.abs((quantized_float_y.dequantize() - qy.dequantize())).sum()
-                    self.assertTrue(
-                        (qy == quantized_float_y).all(),
-                        f"{scale}, {zero_point}, {qdtype}, {n_diff}, {diff},",
-                    )
+
+                    try:
+                        # Ideally, we would assert this, which passes with (scale, zero) == (1, 0)
+                        self.assertTrue((qy == quantized_float_y).all())
+                    except AssertionError:
+                        # But because the computation aren't exactly the same between the 2 RoIAlign procedures, some
+                        # rounding error may lead to a difference of 2 in the output.
+                        # For example with (scale, zero) = (2, 10), 45.00000... will be quantized to 44
+                        # but 45.00000001 will be rounded to 46. We make sure below that:
+                        # - such discrepancies between qy and quantized_float_y are very rare (less then 5%)
+                        # - any difference between qy and quantized_float_y is == scale
+                        diff_idx = torch.where(qy != quantized_float_y)
+                        num_diff = diff_idx[0].numel()
+                        self.assertTrue(num_diff / qy.numel() < .05)
+
+                        abs_diff = torch.abs(qy[diff_idx].dequantize() - quantized_float_y[diff_idx].dequantize())
+                        t_scale = torch.full_like(abs_diff, fill_value=scale)
+                        self.assertTrue(torch.allclose(abs_diff, t_scale, atol=1e-5))
 
 
 class PSRoIAlignTester(RoIOpTester, unittest.TestCase):
diff --git a/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp b/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp
index 10c62cf78b3..ad5a7f6166b 100644
--- a/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp
+++ b/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp
@@ -109,39 +109,25 @@ void qroi_align_forward_kernel_impl(
           int index = index_n_c + ph * pooled_width + pw;
 
           float output_val = 0.;
-          // float sum_w = 0.;
+          float sum_w = 0.;
           for (int iy = 0; iy < roi_bin_grid_h; iy++) {
             for (int ix = 0; ix < roi_bin_grid_w; ix++) {
               detail::PreCalc<float> pc = pre_calc[pre_calc_index];
 
-              output_val += pc.w1 *
-                      at::native::dequantize_val(
-                                input_scale, input_zp, offset_input[pc.pos1]) +
-                  pc.w2 *
-                      at::native::dequantize_val(
-                          input_scale, input_zp, offset_input[pc.pos2]) +
-                  pc.w3 *
-                      at::native::dequantize_val(
-                          input_scale, input_zp, offset_input[pc.pos3]) +
-                  pc.w4 *
-                      at::native::dequantize_val(
-                          input_scale, input_zp, offset_input[pc.pos4]);
-
-              // FIXME: Possible optimization. Unfortunately the tests fail
-              // on some (few) inputs: Python rounds up while the C++ code
-              // rounds down (or the other way around).
-              // output_val += pc.w1 * offset_input[pc.pos1].val_ +
-              //     pc.w2 * offset_input[pc.pos2].val_ +
-              //     pc.w3 * offset_input[pc.pos3].val_ +
-              //     pc.w4 * offset_input[pc.pos4].val_;
-              // sum_w += pc.w1 + pc.w2 + pc.w3 + pc.w4;
-              // And then dequantize later, just before averaging:
-              // output_val = input_scale * (output_val - (float)input_zp *
-              // sum_w);
+              // Optimization: we use the raw values here and we'll dequantize
+              // later
+              output_val += pc.w1 * offset_input[pc.pos1].val_ +
+                  pc.w2 * offset_input[pc.pos2].val_ +
+                  pc.w3 * offset_input[pc.pos3].val_ +
+                  pc.w4 * offset_input[pc.pos4].val_;
+              sum_w += pc.w1 + pc.w2 + pc.w3 + pc.w4;
 
               pre_calc_index += 1;
             }
           }
+          // Dequantize here
+          output_val = input_scale * (output_val - (float)input_zp * sum_w);
+
           output_val /= count; // Average pooling
 
           output[index] =

From d6f78ab0af9ef3f859733645191b5ebfb57dd1ba Mon Sep 17 00:00:00 2001
From: Nicolas Hug <nicolashug@fb.com>
Date: Wed, 7 Apr 2021 10:53:10 +0100
Subject: [PATCH 16/26] Added check for index upper bound

---
 test/test_ops.py                                    | 13 ++++++++++---
 .../csrc/ops/quantized/cpu/qroi_align_kernel.cpp    | 13 +++++++++++--
 2 files changed, 21 insertions(+), 5 deletions(-)

diff --git a/test/test_ops.py b/test/test_ops.py
index 4a73fe8ee74..a0943d48687 100644
--- a/test/test_ops.py
+++ b/test/test_ops.py
@@ -304,12 +304,12 @@ def test_qroialign(self):
         pool_size = 5
         img_size = 10
         n_channels = 2
-        num_batches = 2
+        num_imgs = 2
         dtype = torch.float
 
         def make_rois(num_rois=1000):
             rois = torch.randint(0, img_size // 2, size=(num_rois, 5)).to(dtype)
-            rois[:, 0] = torch.randint(0, num_batches, size=(num_rois,))  # set batch index
+            rois[:, 0] = torch.randint(0, num_imgs, size=(num_rois,))  # set batch index
             rois[:, 3:] += rois[:, 1:3]  # make sure boxes aren't degenerate
             return rois
 
@@ -317,7 +317,7 @@ def make_rois(num_rois=1000):
             for scale, zero_point in ((1, 0), (2, 10), (0.1, 50)):
                 for qdtype in (torch.qint8, torch.quint8, torch.qint32):
 
-                    x = torch.randint(50, 100, size=(num_batches, n_channels, img_size, img_size)).to(dtype)
+                    x = torch.randint(50, 100, size=(num_imgs, n_channels, img_size, img_size)).to(dtype)
                     qx = torch.quantize_per_tensor(x, scale=scale, zero_point=zero_point, dtype=qdtype)
 
                     rois = make_rois()
@@ -364,6 +364,13 @@ def make_rois(num_rois=1000):
                         t_scale = torch.full_like(abs_diff, fill_value=scale)
                         self.assertTrue(torch.allclose(abs_diff, t_scale, atol=1e-5))
 
+        x = torch.randint(50, 100, size=(129, 3, 10, 10)).to(dtype)
+        qx = torch.quantize_per_tensor(x, scale=0, zero_point=1, dtype=torch.qint8)
+        rois = make_rois(10)
+        qrois = torch.quantize_per_tensor(rois, scale=0, zero_point=1, dtype=torch.qint8)
+        with self.assertRaisesRegex(RuntimeError, "There are 129 input images in the batch, but the RoIs tensor"):
+            ops.roi_align(qx, qrois, output_size=pool_size)
+
 
 class PSRoIAlignTester(RoIOpTester, unittest.TestCase):
     def fn(self, x, rois, pool_h, pool_w, spatial_scale=1, sampling_ratio=-1, **kwargs):
diff --git a/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp b/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp
index ad5a7f6166b..7b88cbe2b9b 100644
--- a/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp
+++ b/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp
@@ -36,8 +36,7 @@ void qroi_align_forward_kernel_impl(
 
     const T* offset_rois = rois + n * 5;
     int roi_batch_ind = at::native::dequantize_val(
-        rois_scale, rois_zp, offset_rois[0]); // FIXME: This can be out of the
-                                              // range of the quantized type!!
+        rois_scale, rois_zp, offset_rois[0]);
 
     // Do not using rounding; this implementation detail is critical
     float offset = aligned ? 0.5 : 0.;
@@ -172,6 +171,16 @@ at::Tensor qroi_align_forward_kernel(
     return output;
 
   AT_DISPATCH_QINT_TYPES(input.scalar_type(), "qroi_align_forward_kernel", [&] {
+    // Note: q_max relates to the input tensor, but we need that of the rois
+    // tensor. They're the same since we make sure rois and input have the same
+    // type above.
+    uint64_t max_indexable = std::numeric_limits<underlying_t>::max() + 1;
+    std::string err_msg = "There are " + std::to_string(input.size(0)) +
+        " input images in the batch, but the RoIs tensor can only index up to " +
+        std::to_string(max_indexable) +
+        " images. Try to reduce the batch size.";
+    TORCH_CHECK(input.size(0) <= max_indexable, err_msg);
+
     qroi_align_forward_kernel_impl<scalar_t>(
         num_rois,
         input,

From 61564ca8183f9406c6e019a151c731d232191a89 Mon Sep 17 00:00:00 2001
From: Nicolas Hug <nicolashug@fb.com>
Date: Wed, 7 Apr 2021 11:31:46 +0100
Subject: [PATCH 17/26] avoid possible overflow

---
 torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp b/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp
index 7b88cbe2b9b..012c897cb6e 100644
--- a/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp
+++ b/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp
@@ -174,12 +174,12 @@ at::Tensor qroi_align_forward_kernel(
     // Note: q_max relates to the input tensor, but we need that of the rois
     // tensor. They're the same since we make sure rois and input have the same
     // type above.
-    uint64_t max_indexable = std::numeric_limits<underlying_t>::max() + 1;
+    int64_t q_max = std::numeric_limits<underlying_t>::max();
     std::string err_msg = "There are " + std::to_string(input.size(0)) +
         " input images in the batch, but the RoIs tensor can only index up to " +
-        std::to_string(max_indexable) +
+        std::to_string(q_max + 1) +
         " images. Try to reduce the batch size.";
-    TORCH_CHECK(input.size(0) <= max_indexable, err_msg);
+    TORCH_CHECK(input.size(0) - 1 <= q_max, err_msg);
 
     qroi_align_forward_kernel_impl<scalar_t>(
         num_rois,

From 369fd33d9c62e6369acac96f2c0cd5eb6f79cc50 Mon Sep 17 00:00:00 2001
From: Nicolas Hug <nicolashug@fb.com>
Date: Wed, 7 Apr 2021 12:00:54 +0100
Subject: [PATCH 18/26] Move common function into common.h

---
 torchvision/csrc/ops/cpu/roi_align_common.h   | 118 ++++++++++++++++++
 torchvision/csrc/ops/cpu/roi_align_kernel.cpp |   2 +-
 .../ops/quantized/cpu/qroi_align_kernel.cpp   |   2 +-
 torchvision/csrc/ops/roi_align.h              | 116 -----------------
 4 files changed, 120 insertions(+), 118 deletions(-)
 create mode 100644 torchvision/csrc/ops/cpu/roi_align_common.h

diff --git a/torchvision/csrc/ops/cpu/roi_align_common.h b/torchvision/csrc/ops/cpu/roi_align_common.h
new file mode 100644
index 00000000000..c34762f8222
--- /dev/null
+++ b/torchvision/csrc/ops/cpu/roi_align_common.h
@@ -0,0 +1,118 @@
+#pragma once
+
+#include <ATen/ATen.h>
+
+namespace vision {
+namespace ops {
+namespace detail {
+
+template <typename T>
+struct PreCalc {
+  int pos1;
+  int pos2;
+  int pos3;
+  int pos4;
+  T w1;
+  T w2;
+  T w3;
+  T w4;
+};
+
+template <typename T>
+void pre_calc_for_bilinear_interpolate(
+    int height,
+    int width,
+    int pooled_height,
+    int pooled_width,
+    T roi_start_h,
+    T roi_start_w,
+    T bin_size_h,
+    T bin_size_w,
+    int roi_bin_grid_h,
+    int roi_bin_grid_w,
+    std::vector<PreCalc<T>>& pre_calc);
+  int pre_calc_index = 0;
+  for (int ph = 0; ph < pooled_height; ph++) {
+    for (int pw = 0; pw < pooled_width; pw++) {
+      for (int iy = 0; iy < roi_bin_grid_h; iy++) {
+        const T yy = roi_start_h + ph * bin_size_h +
+            static_cast<T>(iy + .5f) * bin_size_h /
+                static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
+        for (int ix = 0; ix < roi_bin_grid_w; ix++) {
+          const T xx = roi_start_w + pw * bin_size_w +
+              static_cast<T>(ix + .5f) * bin_size_w /
+                  static_cast<T>(roi_bin_grid_w);
+
+          T x = xx;
+          T y = yy;
+          // deal with: inverse elements are out of feature map boundary
+          if (y < -1.0 || y > height || x < -1.0 || x > width) {
+            // empty
+            PreCalc<T> pc;
+            pc.pos1 = 0;
+            pc.pos2 = 0;
+            pc.pos3 = 0;
+            pc.pos4 = 0;
+            pc.w1 = 0;
+            pc.w2 = 0;
+            pc.w3 = 0;
+            pc.w4 = 0;
+            pre_calc[pre_calc_index] = pc;
+            pre_calc_index += 1;
+            continue;
+          }
+
+          if (y <= 0) {
+            y = 0;
+          }
+          if (x <= 0) {
+            x = 0;
+          }
+
+          int y_low = (int)y;
+          int x_low = (int)x;
+          int y_high;
+          int x_high;
+
+          if (y_low >= height - 1) {
+            y_high = y_low = height - 1;
+            y = (T)y_low;
+          } else {
+            y_high = y_low + 1;
+          }
+
+          if (x_low >= width - 1) {
+            x_high = x_low = width - 1;
+            x = (T)x_low;
+          } else {
+            x_high = x_low + 1;
+          }
+
+          T ly = y - y_low;
+          T lx = x - x_low;
+          T hy = 1. - ly, hx = 1. - lx;
+          T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
+
+          // save weights and indices
+          PreCalc<T> pc;
+          pc.pos1 = y_low * width + x_low;
+          pc.pos2 = y_low * width + x_high;
+          pc.pos3 = y_high * width + x_low;
+          pc.pos4 = y_high * width + x_high;
+          pc.w1 = w1;
+          pc.w2 = w2;
+          pc.w3 = w3;
+          pc.w4 = w4;
+          pre_calc[pre_calc_index] = pc;
+
+          pre_calc_index += 1;
+        }
+      }
+    }
+  }
+}
+
+
+} // namespace detail
+} // namespace ops
+} // namespace vision
\ No newline at end of file
diff --git a/torchvision/csrc/ops/cpu/roi_align_kernel.cpp b/torchvision/csrc/ops/cpu/roi_align_kernel.cpp
index 619064cb40e..e6684e953d0 100644
--- a/torchvision/csrc/ops/cpu/roi_align_kernel.cpp
+++ b/torchvision/csrc/ops/cpu/roi_align_kernel.cpp
@@ -1,7 +1,7 @@
 #include <ATen/ATen.h>
 #include <torch/library.h>
 
-#include "../roi_align.h"
+#include "./roi_align_common.h"
 
 namespace vision {
 namespace ops {
diff --git a/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp b/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp
index 012c897cb6e..6143948525e 100644
--- a/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp
+++ b/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp
@@ -2,7 +2,7 @@
 #include <ATen/native/quantized/affine_quantizer.h>
 #include <torch/library.h>
 
-#include "../../roi_align.h"
+#include "../../cpu/roi_align_common.h"
 
 namespace vision {
 namespace ops {
diff --git a/torchvision/csrc/ops/roi_align.h b/torchvision/csrc/ops/roi_align.h
index db14e896fe6..6a1cca35a3d 100644
--- a/torchvision/csrc/ops/roi_align.h
+++ b/torchvision/csrc/ops/roi_align.h
@@ -30,122 +30,6 @@ at::Tensor _roi_align_backward(
     int64_t sampling_ratio,
     bool aligned);
 
-template <typename T>
-struct PreCalc {
-  int pos1;
-  int pos2;
-  int pos3;
-  int pos4;
-  T w1;
-  T w2;
-  T w3;
-  T w4;
-};
-
-// This helper computes the interpolation weights (w1, w2...) for every sampling
-// point of a given box. There are pool_height * pool_width * roi_bin_grid_h *
-// roi_bin_grid_w such sampling points.
-//
-// The weights (w1, w2...) are computed as the areas in this figure:
-// https://en.wikipedia.org/wiki/Bilinear_interpolation#/media/File:Bilinear_interpolation_visualisation.svg
-// and pos1, pos2 etc correspond to the indices of their respective pixels.
-//
-// Note: the weights and indices are shared across all channels, which is why
-// they are pre-calculated prior to the main loop in the RoIAlign kernel.
-// implementation taken from Caffe2
-template <typename T>
-void pre_calc_for_bilinear_interpolate(
-    int height,
-    int width,
-    int pooled_height,
-    int pooled_width,
-    T roi_start_h,
-    T roi_start_w,
-    T bin_size_h,
-    T bin_size_w,
-    int roi_bin_grid_h,
-    int roi_bin_grid_w,
-    std::vector<PreCalc<T>>& pre_calc) {
-  int pre_calc_index = 0;
-  for (int ph = 0; ph < pooled_height; ph++) {
-    for (int pw = 0; pw < pooled_width; pw++) {
-      for (int iy = 0; iy < roi_bin_grid_h; iy++) {
-        const T yy = roi_start_h + ph * bin_size_h +
-            static_cast<T>(iy + .5f) * bin_size_h /
-                static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
-        for (int ix = 0; ix < roi_bin_grid_w; ix++) {
-          const T xx = roi_start_w + pw * bin_size_w +
-              static_cast<T>(ix + .5f) * bin_size_w /
-                  static_cast<T>(roi_bin_grid_w);
-
-          T x = xx;
-          T y = yy;
-          // deal with: inverse elements are out of feature map boundary
-          if (y < -1.0 || y > height || x < -1.0 || x > width) {
-            // empty
-            PreCalc<T> pc;
-            pc.pos1 = 0;
-            pc.pos2 = 0;
-            pc.pos3 = 0;
-            pc.pos4 = 0;
-            pc.w1 = 0;
-            pc.w2 = 0;
-            pc.w3 = 0;
-            pc.w4 = 0;
-            pre_calc[pre_calc_index] = pc;
-            pre_calc_index += 1;
-            continue;
-          }
-
-          if (y <= 0) {
-            y = 0;
-          }
-          if (x <= 0) {
-            x = 0;
-          }
-
-          int y_low = (int)y;
-          int x_low = (int)x;
-          int y_high;
-          int x_high;
-
-          if (y_low >= height - 1) {
-            y_high = y_low = height - 1;
-            y = (T)y_low;
-          } else {
-            y_high = y_low + 1;
-          }
-
-          if (x_low >= width - 1) {
-            x_high = x_low = width - 1;
-            x = (T)x_low;
-          } else {
-            x_high = x_low + 1;
-          }
-
-          T ly = y - y_low;
-          T lx = x - x_low;
-          T hy = 1. - ly, hx = 1. - lx;
-          T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
-
-          // save weights and indices
-          PreCalc<T> pc;
-          pc.pos1 = y_low * width + x_low;
-          pc.pos2 = y_low * width + x_high;
-          pc.pos3 = y_high * width + x_low;
-          pc.pos4 = y_high * width + x_high;
-          pc.w1 = w1;
-          pc.w2 = w2;
-          pc.w3 = w3;
-          pc.w4 = w4;
-          pre_calc[pre_calc_index] = pc;
-
-          pre_calc_index += 1;
-        }
-      }
-    }
-  }
-}
 
 } // namespace detail
 

From bcadc0f3c27dd9a8b225d2f776a48c151827e265 Mon Sep 17 00:00:00 2001
From: Nicolas Hug <nicolashug@fb.com>
Date: Wed, 7 Apr 2021 12:01:41 +0100
Subject: [PATCH 19/26] oops

---
 torchvision/csrc/ops/cpu/roi_align_common.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torchvision/csrc/ops/cpu/roi_align_common.h b/torchvision/csrc/ops/cpu/roi_align_common.h
index c34762f8222..bc3031a58b6 100644
--- a/torchvision/csrc/ops/cpu/roi_align_common.h
+++ b/torchvision/csrc/ops/cpu/roi_align_common.h
@@ -30,7 +30,7 @@ void pre_calc_for_bilinear_interpolate(
     T bin_size_w,
     int roi_bin_grid_h,
     int roi_bin_grid_w,
-    std::vector<PreCalc<T>>& pre_calc);
+    std::vector<PreCalc<T>>& pre_calc){
   int pre_calc_index = 0;
   for (int ph = 0; ph < pooled_height; ph++) {
     for (int pw = 0; pw < pooled_width; pw++) {

From 6792e65955b59038c24785a07dfbc944d51539d7 Mon Sep 17 00:00:00 2001
From: Nicolas Hug <nicolashug@fb.com>
Date: Wed, 7 Apr 2021 13:16:23 +0100
Subject: [PATCH 20/26] scale=1,zero_point=0 makes more sense

---
 test/test_ops.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/test_ops.py b/test/test_ops.py
index a0943d48687..d50b6905632 100644
--- a/test/test_ops.py
+++ b/test/test_ops.py
@@ -365,9 +365,9 @@ def make_rois(num_rois=1000):
                         self.assertTrue(torch.allclose(abs_diff, t_scale, atol=1e-5))
 
         x = torch.randint(50, 100, size=(129, 3, 10, 10)).to(dtype)
-        qx = torch.quantize_per_tensor(x, scale=0, zero_point=1, dtype=torch.qint8)
+        qx = torch.quantize_per_tensor(x, scale=1, zero_point=0, dtype=torch.qint8)
         rois = make_rois(10)
-        qrois = torch.quantize_per_tensor(rois, scale=0, zero_point=1, dtype=torch.qint8)
+        qrois = torch.quantize_per_tensor(rois, scale=1, zero_point=0, dtype=torch.qint8)
         with self.assertRaisesRegex(RuntimeError, "There are 129 input images in the batch, but the RoIs tensor"):
             ops.roi_align(qx, qrois, output_size=pool_size)
 

From dde14ed370fd5b192df4fa8c2b684bc90036bb6c Mon Sep 17 00:00:00 2001
From: Nicolas Hug <nicolashug@fb.com>
Date: Wed, 7 Apr 2021 15:01:57 +0100
Subject: [PATCH 21/26] Force batch size of 1 to prevent any indexing bug

---
 test/test_ops.py                              |  6 ++---
 torchvision/csrc/ops/cpu/roi_align_common.h   |  1 -
 .../ops/quantized/cpu/qroi_align_kernel.cpp   | 22 +++++++++----------
 3 files changed, 13 insertions(+), 16 deletions(-)

diff --git a/test/test_ops.py b/test/test_ops.py
index d50b6905632..8c63c9c29c6 100644
--- a/test/test_ops.py
+++ b/test/test_ops.py
@@ -304,7 +304,7 @@ def test_qroialign(self):
         pool_size = 5
         img_size = 10
         n_channels = 2
-        num_imgs = 2
+        num_imgs = 1
         dtype = torch.float
 
         def make_rois(num_rois=1000):
@@ -364,11 +364,11 @@ def make_rois(num_rois=1000):
                         t_scale = torch.full_like(abs_diff, fill_value=scale)
                         self.assertTrue(torch.allclose(abs_diff, t_scale, atol=1e-5))
 
-        x = torch.randint(50, 100, size=(129, 3, 10, 10)).to(dtype)
+        x = torch.randint(50, 100, size=(2, 3, 10, 10)).to(dtype)
         qx = torch.quantize_per_tensor(x, scale=1, zero_point=0, dtype=torch.qint8)
         rois = make_rois(10)
         qrois = torch.quantize_per_tensor(rois, scale=1, zero_point=0, dtype=torch.qint8)
-        with self.assertRaisesRegex(RuntimeError, "There are 129 input images in the batch, but the RoIs tensor"):
+        with self.assertRaisesRegex(RuntimeError, "Only one image per batch is allowed"):
             ops.roi_align(qx, qrois, output_size=pool_size)
 
 
diff --git a/torchvision/csrc/ops/cpu/roi_align_common.h b/torchvision/csrc/ops/cpu/roi_align_common.h
index bc3031a58b6..450db1f13f7 100644
--- a/torchvision/csrc/ops/cpu/roi_align_common.h
+++ b/torchvision/csrc/ops/cpu/roi_align_common.h
@@ -112,7 +112,6 @@ void pre_calc_for_bilinear_interpolate(
   }
 }
 
-
 } // namespace detail
 } // namespace ops
 } // namespace vision
\ No newline at end of file
diff --git a/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp b/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp
index 6143948525e..e34b277747e 100644
--- a/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp
+++ b/torchvision/csrc/ops/quantized/cpu/qroi_align_kernel.cpp
@@ -35,8 +35,9 @@ void qroi_align_forward_kernel_impl(
     int index_n = n * channels * pooled_width * pooled_height;
 
     const T* offset_rois = rois + n * 5;
-    int roi_batch_ind = at::native::dequantize_val(
-        rois_scale, rois_zp, offset_rois[0]);
+
+    // FIXME: change this when batches of size > 1 are allowed
+    const int roi_batch_ind = 0;
 
     // Do not using rounding; this implementation detail is critical
     float offset = aligned ? 0.5 : 0.;
@@ -148,6 +149,13 @@ at::Tensor qroi_align_forward_kernel(
   TORCH_CHECK(input.device().is_cpu(), "input must be a CPU tensor");
   TORCH_CHECK(rois.device().is_cpu(), "rois must be a CPU tensor");
   TORCH_CHECK(rois.size(1) == 5, "rois must have shape as Tensor[K, 5]");
+  // The first column of the RoI tensor is an image index, but not all indices
+  // are representable depending on the quantization. For example 1, 3, 5...
+  // indices can't be represented when qscale is 2. To prevent any bug, we force
+  // a batch size of 1 and we ignore the first column
+  TORCH_CHECK(
+      input.size(0) == 1,
+      "Only one image per batch is allowed in roi_align when quantized tensors are passed.");
 
   at::TensorArg input_t{input, "input", 1}, rois_t{rois, "rois", 2};
 
@@ -171,16 +179,6 @@ at::Tensor qroi_align_forward_kernel(
     return output;
 
   AT_DISPATCH_QINT_TYPES(input.scalar_type(), "qroi_align_forward_kernel", [&] {
-    // Note: q_max relates to the input tensor, but we need that of the rois
-    // tensor. They're the same since we make sure rois and input have the same
-    // type above.
-    int64_t q_max = std::numeric_limits<underlying_t>::max();
-    std::string err_msg = "There are " + std::to_string(input.size(0)) +
-        " input images in the batch, but the RoIs tensor can only index up to " +
-        std::to_string(q_max + 1) +
-        " images. Try to reduce the batch size.";
-    TORCH_CHECK(input.size(0) - 1 <= q_max, err_msg);
-
     qroi_align_forward_kernel_impl<scalar_t>(
         num_rois,
         input,

From 457aab025bd77815602190a30e87e2443c14757c Mon Sep 17 00:00:00 2001
From: Nicolas Hug <nicolashug@fb.com>
Date: Wed, 7 Apr 2021 15:06:47 +0100
Subject: [PATCH 22/26] format

---
 torchvision/csrc/ops/cpu/roi_align_common.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torchvision/csrc/ops/cpu/roi_align_common.h b/torchvision/csrc/ops/cpu/roi_align_common.h
index 450db1f13f7..e59595101ae 100644
--- a/torchvision/csrc/ops/cpu/roi_align_common.h
+++ b/torchvision/csrc/ops/cpu/roi_align_common.h
@@ -30,7 +30,7 @@ void pre_calc_for_bilinear_interpolate(
     T bin_size_w,
     int roi_bin_grid_h,
     int roi_bin_grid_w,
-    std::vector<PreCalc<T>>& pre_calc){
+    std::vector<PreCalc<T>>& pre_calc) {
   int pre_calc_index = 0;
   for (int ph = 0; ph < pooled_height; ph++) {
     for (int pw = 0; pw < pooled_width; pw++) {

From 0c7bb11feb7552f51629b589f97f33b6a9a68d6c Mon Sep 17 00:00:00 2001
From: Nicolas Hug <nicolashug@fb.com>
Date: Wed, 7 Apr 2021 15:43:44 +0100
Subject: [PATCH 23/26] format again

---
 torchvision/csrc/ops/roi_align.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/torchvision/csrc/ops/roi_align.h b/torchvision/csrc/ops/roi_align.h
index 6a1cca35a3d..2ddb6ac3945 100644
--- a/torchvision/csrc/ops/roi_align.h
+++ b/torchvision/csrc/ops/roi_align.h
@@ -30,7 +30,6 @@ at::Tensor _roi_align_backward(
     int64_t sampling_ratio,
     bool aligned);
 
-
 } // namespace detail
 
 } // namespace ops

From e96cf1a029ea91852f0c1d2fe3de56445a7b0973 Mon Sep 17 00:00:00 2001
From: Nicolas Hug <nicolashug@fb.com>
Date: Wed, 7 Apr 2021 15:48:06 +0100
Subject: [PATCH 24/26] updated docstring

---
 torchvision/ops/roi_align.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/torchvision/ops/roi_align.py b/torchvision/ops/roi_align.py
index ae4c9dd3678..2e58646151f 100644
--- a/torchvision/ops/roi_align.py
+++ b/torchvision/ops/roi_align.py
@@ -20,14 +20,14 @@ def roi_align(
     Performs Region of Interest (RoI) Align operator with average pooling, as described in Mask R-CNN.
 
     Args:
-        input (Tensor[N, C, H, W]): input tensor
+        input (Tensor[N, C, H, W]): The input tensor, i.e. a batch with ``N`` feature maps
         boxes (Tensor[K, 5] or List[Tensor[L, 4]]): the box coordinates in (x1, y1, x2, y2)
             format where the regions will be taken from.
             The coordinate must satisfy ``0 <= x1 < x2`` and ``0 <= y1 < y2``.
-            If a single Tensor is passed,
-            then the first column should contain the batch index. If a list of Tensors
-            is passed, then each Tensor will correspond to the boxes for an element i
-            in a batch.
+            If a single Tensor is passed, then the first column should
+            contain the index of the corresponding feature map in the batch, i.e. a number in ``[0, N - 1]``.
+            If a list of Tensors is passed, then each Tensor will correspond to the boxes for an element i
+            in the batch.
         output_size (int or Tuple[int, int]): the size of the output (in bins or pixels) after the pooling
             is performed, as (height, width).
         spatial_scale (float): a scaling factor that maps the input coordinates to

From 45d083fc1693b5cd56524a4ab46ed23be42698d6 Mon Sep 17 00:00:00 2001
From: Nicolas Hug <nicolashug@fb.com>
Date: Thu, 8 Apr 2021 09:50:51 +0100
Subject: [PATCH 25/26] put back description comment for
 pre_calc_for_bilinear_interpolate

---
 torchvision/csrc/ops/cpu/roi_align_common.h | 13 ++++++++++++-
 torchvision/ops/roi_align.py                |  3 ++-
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/torchvision/csrc/ops/cpu/roi_align_common.h b/torchvision/csrc/ops/cpu/roi_align_common.h
index e59595101ae..e10c67b5b79 100644
--- a/torchvision/csrc/ops/cpu/roi_align_common.h
+++ b/torchvision/csrc/ops/cpu/roi_align_common.h
@@ -18,6 +18,17 @@ struct PreCalc {
   T w4;
 };
 
+// This helper computes the interpolation weights (w1, w2...) for every sampling
+// point of a given box. There are pool_height * pool_width * roi_bin_grid_h *
+// roi_bin_grid_w such sampling points.
+//
+// The weights (w1, w2...) are computed as the areas in this figure:
+// https://en.wikipedia.org/wiki/Bilinear_interpolation#/media/File:Bilinear_interpolation_visualisation.svg
+// and pos1, pos2 etc correspond to the indices of their respective pixels.
+//
+// Note: the weights and indices are shared across all channels, which is why
+// they are pre-calculated prior to the main loop in the RoIAlign kernel.
+// implementation taken from Caffe2
 template <typename T>
 void pre_calc_for_bilinear_interpolate(
     int height,
@@ -114,4 +125,4 @@ void pre_calc_for_bilinear_interpolate(
 
 } // namespace detail
 } // namespace ops
-} // namespace vision
\ No newline at end of file
+} // namespace vision
diff --git a/torchvision/ops/roi_align.py b/torchvision/ops/roi_align.py
index 2e58646151f..b29101a448c 100644
--- a/torchvision/ops/roi_align.py
+++ b/torchvision/ops/roi_align.py
@@ -20,7 +20,8 @@ def roi_align(
     Performs Region of Interest (RoI) Align operator with average pooling, as described in Mask R-CNN.
 
     Args:
-        input (Tensor[N, C, H, W]): The input tensor, i.e. a batch with ``N`` feature maps
+        input (Tensor[N, C, H, W]): The input tensor, i.e. a batch with ``N`` feature maps.
+            If the tensor is quantized, we expect a batch size of ``N == 1``.
         boxes (Tensor[K, 5] or List[Tensor[L, 4]]): the box coordinates in (x1, y1, x2, y2)
             format where the regions will be taken from.
             The coordinate must satisfy ``0 <= x1 < x2`` and ``0 <= y1 < y2``.

From 3ab6b6657977b9fbb14b7c26feea779ca88ebd8f Mon Sep 17 00:00:00 2001
From: Nicolas Hug <nicolashug@fb.com>
Date: Thu, 8 Apr 2021 09:59:37 +0100
Subject: [PATCH 26/26] revert most changes to docstring as it's taken care of
 in another PR

---
 torchvision/ops/roi_align.py | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/torchvision/ops/roi_align.py b/torchvision/ops/roi_align.py
index b29101a448c..c0ac14329d4 100644
--- a/torchvision/ops/roi_align.py
+++ b/torchvision/ops/roi_align.py
@@ -17,30 +17,30 @@ def roi_align(
     aligned: bool = False,
 ) -> Tensor:
     """
-    Performs Region of Interest (RoI) Align operator with average pooling, as described in Mask R-CNN.
+    Performs Region of Interest (RoI) Align operator described in Mask R-CNN
 
     Args:
-        input (Tensor[N, C, H, W]): The input tensor, i.e. a batch with ``N`` feature maps.
+        input (Tensor[N, C, H, W]): input tensor
             If the tensor is quantized, we expect a batch size of ``N == 1``.
         boxes (Tensor[K, 5] or List[Tensor[L, 4]]): the box coordinates in (x1, y1, x2, y2)
             format where the regions will be taken from.
             The coordinate must satisfy ``0 <= x1 < x2`` and ``0 <= y1 < y2``.
-            If a single Tensor is passed, then the first column should
-            contain the index of the corresponding feature map in the batch, i.e. a number in ``[0, N - 1]``.
-            If a list of Tensors is passed, then each Tensor will correspond to the boxes for an element i
-            in the batch.
-        output_size (int or Tuple[int, int]): the size of the output (in bins or pixels) after the pooling
-            is performed, as (height, width).
+            If a single Tensor is passed,
+            then the first column should contain the batch index. If a list of Tensors
+            is passed, then each Tensor will correspond to the boxes for an element i
+            in a batch
+        output_size (int or Tuple[int, int]): the size of the output after the cropping
+            is performed, as (height, width)
         spatial_scale (float): a scaling factor that maps the input coordinates to
             the box coordinates. Default: 1.0
         sampling_ratio (int): number of sampling points in the interpolation grid
             used to compute the output value of each pooled output bin. If > 0,
-            then exactly ``sampling_ratio x sampling_ratio`` sampling points per bin are used. If
+            then exactly sampling_ratio x sampling_ratio grid points are used. If
             <= 0, then an adaptive number of grid points are used (computed as
-            ``ceil(roi_width / output_width)``, and likewise for height). Default: -1
+            ceil(roi_width / pooled_w), and likewise for height). Default: -1
         aligned (bool): If False, use the legacy implementation.
-            If True, pixel shift the box coordinates it by -0.5 for a better alignment with the two
-            neighboring pixel indices. This version is used in Detectron2
+            If True, pixel shift it by -0.5 for align more perfectly about two neighboring pixel indices.
+            This version in Detectron2
 
     Returns:
         Tensor[K, C, output_size[0], output_size[1]]: The pooled RoIs.