diff --git a/setup.py b/setup.py
index c998118335b..23bbdaab378 100644
--- a/setup.py
+++ b/setup.py
@@ -138,8 +138,11 @@ def get_extensions():
 
     main_file = glob.glob(os.path.join(extensions_dir, '*.cpp')) + glob.glob(os.path.join(extensions_dir, 'ops',
                                                                                           '*.cpp'))
-    source_cpu = glob.glob(os.path.join(extensions_dir, 'ops', 'autograd', '*.cpp')) + glob.glob(
-        os.path.join(extensions_dir, 'ops', 'cpu', '*.cpp'))
+    source_cpu = (
+        glob.glob(os.path.join(extensions_dir, 'ops', 'autograd', '*.cpp'))
+        + glob.glob(os.path.join(extensions_dir, 'ops', 'cpu', '*.cpp'))
+        + glob.glob(os.path.join(extensions_dir, 'ops', 'quantized', 'cpu', '*.cpp'))
+    )
 
     is_rocm_pytorch = False
     if torch.__version__ >= '1.5':
diff --git a/test/test_ops.py b/test/test_ops.py
index 8c938ae0e79..0031da45cce 100644
--- a/test/test_ops.py
+++ b/test/test_ops.py
@@ -418,6 +418,29 @@ def test_nms(self):
         self.assertRaises(RuntimeError, ops.nms, torch.rand(3, 4), torch.rand(3, 2), 0.5)
         self.assertRaises(RuntimeError, ops.nms, torch.rand(3, 4), torch.rand(4), 0.5)
 
+    def test_qnms(self):
+        # Note: we compare qnms vs nms instead of qnms vs the reference implementation.
+        # This is because with the int conversion, the trick used in _create_tensors_with_iou
+        # doesn't really work (in fact, nms vs the reference implementation will also fail with ints)
+        err_msg = 'NMS and QNMS give different results for IoU={}'
+        for iou in [0.2, 0.5, 0.8]:
+            for scale, zero_point in ((1, 0), (2, 50), (3, 10)):
+                boxes, scores = self._create_tensors_with_iou(1000, iou)
+                scores *= 100  # otherwise most scores would be 0 or 1 after int conversion
+
+                qboxes = torch.quantize_per_tensor(boxes, scale=scale, zero_point=zero_point,
+                                                   dtype=torch.quint8)
+                qscores = torch.quantize_per_tensor(scores, scale=scale, zero_point=zero_point,
+                                                    dtype=torch.quint8)
+
+                boxes = qboxes.dequantize()
+                scores = qscores.dequantize()
+
+                keep = ops.nms(boxes, scores, iou)
+                qkeep = ops.nms(qboxes, qscores, iou)
+
+                self.assertTrue(torch.allclose(qkeep, keep), err_msg.format(iou))
+
     @unittest.skipIf(not torch.cuda.is_available(), "CUDA unavailable")
     def test_nms_cuda(self, dtype=torch.float64):
         tol = 1e-3 if dtype is torch.half else 1e-5
diff --git a/torchvision/csrc/ops/quantized/cpu/qnms_kernel.cpp b/torchvision/csrc/ops/quantized/cpu/qnms_kernel.cpp
new file mode 100644
index 00000000000..f7b081327b2
--- /dev/null
+++ b/torchvision/csrc/ops/quantized/cpu/qnms_kernel.cpp
@@ -0,0 +1,129 @@
+#include <ATen/ATen.h>
+#include <ATen/native/quantized/affine_quantizer.h>
+#include <torch/library.h>
+
+namespace vision {
+namespace ops {
+
+namespace {
+
+template <typename scalar_t>
+at::Tensor qnms_kernel_impl(
+    const at::Tensor& dets,
+    const at::Tensor& scores,
+    double iou_threshold) {
+  TORCH_CHECK(!dets.is_cuda(), "dets must be a CPU tensor");
+  TORCH_CHECK(!scores.is_cuda(), "scores must be a CPU tensor");
+  TORCH_CHECK(
+      dets.scalar_type() == scores.scalar_type(),
+      "dets should have the same type as scores");
+
+  if (dets.numel() == 0)
+    return at::empty({0}, dets.options().dtype(at::kLong));
+
+  const auto ndets = dets.size(0);
+
+  auto x1_t = dets.select(1, 0).contiguous();
+  auto y1_t = dets.select(1, 1).contiguous();
+  auto x2_t = dets.select(1, 2).contiguous();
+  auto y2_t = dets.select(1, 3).contiguous();
+  auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));
+  at::Tensor suppressed_t = at::zeros({ndets}, dets.options().dtype(at::kByte));
+  at::Tensor keep_t = at::zeros({ndets}, dets.options().dtype(at::kLong));
+  at::Tensor areas_t = at::zeros({ndets}, dets.options().dtype(at::kFloat));
+
+  auto suppressed = suppressed_t.data_ptr<uint8_t>();
+  auto keep = keep_t.data_ptr<int64_t>();
+  auto order = order_t.data_ptr<int64_t>();
+  auto x1 = x1_t.data_ptr<scalar_t>();
+  auto y1 = y1_t.data_ptr<scalar_t>();
+  auto x2 = x2_t.data_ptr<scalar_t>();
+  auto y2 = y2_t.data_ptr<scalar_t>();
+  auto areas = areas_t.data_ptr<float>();
+
+  for (int64_t i = 0; i < ndets; i++) {
+    // Note 1: To get the exact area we'd need to multiply by scale**2, but this
+    // would get canceled out in the computation of ovr below. So we leave that
+    // out.
+    // Note 2: degenerate boxes (x2 < x1 or y2 < y1) may underflow, although
+    // integral promotion rules will likely prevent it (see
+    // https://stackoverflow.com/questions/32959564/subtraction-of-two-unsigned-gives-signed
+    // for more details).
+    areas[i] = (x2[i].val_ - x1[i].val_) * (y2[i].val_ - y1[i].val_);
+  }
+
+  int64_t num_to_keep = 0;
+
+  for (int64_t _i = 0; _i < ndets; _i++) {
+    auto i = order[_i];
+    if (suppressed[i] == 1)
+      continue;
+    keep[num_to_keep++] = i;
+
+    // We explicitly cast coordinates to float so that the code can be
+    // vectorized.
+    float ix1val = x1[i].val_;
+    float iy1val = y1[i].val_;
+    float ix2val = x2[i].val_;
+    float iy2val = y2[i].val_;
+    float iarea = areas[i];
+
+    for (int64_t _j = _i + 1; _j < ndets; _j++) {
+      auto j = order[_j];
+      if (suppressed[j] == 1)
+        continue;
+      float xx1 = std::max(ix1val, (float)x1[j].val_);
+      float yy1 = std::max(iy1val, (float)y1[j].val_);
+      float xx2 = std::min(ix2val, (float)x2[j].val_);
+      float yy2 = std::min(iy2val, (float)y2[j].val_);
+
+      auto w = std::max(0.f, xx2 - xx1); // * scale (gets canceled below)
+      auto h = std::max(0.f, yy2 - yy1); // * scale (gets canceled below)
+      auto inter = w * h;
+      auto ovr = inter / (iarea + areas[j] - inter);
+      if (ovr > iou_threshold)
+        suppressed[j] = 1;
+    }
+  }
+  return keep_t.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep);
+}
+
+at::Tensor qnms_kernel(
+    const at::Tensor& dets,
+    const at::Tensor& scores,
+    double iou_threshold) {
+  TORCH_CHECK(
+      dets.dim() == 2, "boxes should be a 2d tensor, got ", dets.dim(), "D");
+  TORCH_CHECK(
+      dets.size(1) == 4,
+      "boxes should have 4 elements in dimension 1, got ",
+      dets.size(1));
+  TORCH_CHECK(
+      scores.dim() == 1,
+      "scores should be a 1d tensor, got ",
+      scores.dim(),
+      "D");
+  TORCH_CHECK(
+      dets.size(0) == scores.size(0),
+      "boxes and scores should have same number of elements in ",
+      "dimension 0, got ",
+      dets.size(0),
+      " and ",
+      scores.size(0));
+
+  auto result = at::empty({0});
+
+  AT_DISPATCH_QINT_TYPES(dets.scalar_type(), "qnms_kernel", [&] {
+    result = qnms_kernel_impl<scalar_t>(dets, scores, iou_threshold);
+  });
+  return result;
+}
+
+} // namespace
+
+TORCH_LIBRARY_IMPL(torchvision, QuantizedCPU, m) {
+  m.impl(TORCH_SELECTIVE_NAME("torchvision::nms"), TORCH_FN(qnms_kernel));
+}
+
+} // namespace ops
+} // namespace vision
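
Since the kernel registers under the existing `torchvision::nms` schema, quantized tensors go through the same public entry point. The reason it can work on raw integer values: dequantization is affine, `x = scale * (q - zero_point)`, so the `zero_point` cancels in every coordinate difference, and the leftover `scale` factor (squared, once for widths and once for heights) multiplies areas and intersection alike and cancels in `ovr = inter / (iarea + areas[j] - inter)`. A minimal usage sketch mirroring `test_qnms` above; the box coordinates, scores, and quantization parameters are illustrative, not taken from the PR:

```python
import torch
from torchvision import ops

# Illustrative float inputs: boxes in (x1, y1, x2, y2) format plus one score each.
boxes = torch.tensor([[0., 0., 10., 10.],
                      [1., 1., 11., 11.],     # overlaps box 0 with IoU ~0.68
                      [20., 20., 30., 30.]])  # disjoint from the others
scores = torch.tensor([90., 80., 70.])

# Quantize both tensors (scale/zero_point chosen arbitrarily for the example).
qboxes = torch.quantize_per_tensor(boxes, scale=1.0, zero_point=0, dtype=torch.quint8)
qscores = torch.quantize_per_tensor(scores, scale=1.0, zero_point=0, dtype=torch.quint8)

# ops.nms dispatches on the input type, so quantized inputs reach the
# QuantizedCPU kernel registered in qnms_kernel.cpp.
qkeep = ops.nms(qboxes, qscores, iou_threshold=0.5)
print(qkeep)  # indices of the surviving boxes: tensor([0, 2])
```

Note that the test compares against `ops.nms` run on the *dequantized* copies of the same tensors rather than on the original floats: quantization rounding can nudge a borderline IoU across the threshold, so both paths must see identical coordinate values for the keep indices to match exactly.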