diff --git a/setup.py b/setup.py
index c998118335b..23bbdaab378 100644
--- a/setup.py
+++ b/setup.py
@@ -138,8 +138,11 @@ def get_extensions():
 
     main_file = glob.glob(os.path.join(extensions_dir, '*.cpp')) + glob.glob(os.path.join(extensions_dir, 'ops',
                                                                                           '*.cpp'))
-    source_cpu = glob.glob(os.path.join(extensions_dir, 'ops', 'autograd', '*.cpp')) + glob.glob(
-        os.path.join(extensions_dir, 'ops', 'cpu', '*.cpp'))
+    source_cpu = (
+        glob.glob(os.path.join(extensions_dir, 'ops', 'autograd', '*.cpp'))
+        + glob.glob(os.path.join(extensions_dir, 'ops', 'cpu', '*.cpp'))
+        + glob.glob(os.path.join(extensions_dir, 'ops', 'quantized', 'cpu', '*.cpp'))
+    )
 
     is_rocm_pytorch = False
     if torch.__version__ >= '1.5':
diff --git a/test/test_ops.py b/test/test_ops.py
index 8c938ae0e79..0031da45cce 100644
--- a/test/test_ops.py
+++ b/test/test_ops.py
@@ -418,6 +418,29 @@ def test_nms(self):
         self.assertRaises(RuntimeError, ops.nms, torch.rand(3, 4), torch.rand(3, 2), 0.5)
         self.assertRaises(RuntimeError, ops.nms, torch.rand(3, 4), torch.rand(4), 0.5)
 
+    def test_qnms(self):
+        # Note: we compare qnms vs nms instead of qnms vs the reference implementation.
+        # This is because with the int conversion, the trick used in _create_tensors_with_iou
+        # doesn't really work (in fact, nms vs the reference implementation will also fail with ints)
+        err_msg = 'NMS and QNMS give different results for IoU={}'
+        for iou in [0.2, 0.5, 0.8]:
+            for scale, zero_point in ((1, 0), (2, 50), (3, 10)):
+                boxes, scores = self._create_tensors_with_iou(1000, iou)
+                scores *= 100  # otherwise most scores would be 0 or 1 after int conversion
+
+                qboxes = torch.quantize_per_tensor(boxes, scale=scale, zero_point=zero_point,
+                                                   dtype=torch.quint8)
+                qscores = torch.quantize_per_tensor(scores, scale=scale, zero_point=zero_point,
+                                                    dtype=torch.quint8)
+
+                boxes = qboxes.dequantize()
+                scores = qscores.dequantize()
+
+                keep = ops.nms(boxes, scores, iou)
+                qkeep = ops.nms(qboxes, qscores, iou)
+
+                self.assertTrue(torch.allclose(qkeep, keep), err_msg.format(iou))
+
     @unittest.skipIf(not torch.cuda.is_available(), "CUDA unavailable")
     def test_nms_cuda(self, dtype=torch.float64):
         tol = 1e-3 if dtype is torch.half else 1e-5
diff --git a/torchvision/csrc/ops/quantized/cpu/qnms_kernel.cpp b/torchvision/csrc/ops/quantized/cpu/qnms_kernel.cpp
new file mode 100644
index 00000000000..f7b081327b2
--- /dev/null
+++ b/torchvision/csrc/ops/quantized/cpu/qnms_kernel.cpp
@@ -0,0 +1,129 @@
+#include <ATen/ATen.h>
+#include <ATen/native/quantized/affine_quantizer.h>
+#include <torch/library.h>
+
+namespace vision {
+namespace ops {
+
+namespace {
+
+template <typename scalar_t>
+at::Tensor qnms_kernel_impl(
+    const at::Tensor& dets,
+    const at::Tensor& scores,
+    double iou_threshold) {
+  TORCH_CHECK(!dets.is_cuda(), "dets must be a CPU tensor");
+  TORCH_CHECK(!scores.is_cuda(), "scores must be a CPU tensor");
+  TORCH_CHECK(
+      dets.scalar_type() == scores.scalar_type(),
+      "dets should have the same type as scores");
+
+  if (dets.numel() == 0)
+    return at::empty({0}, dets.options().dtype(at::kLong));
+
+  const auto ndets = dets.size(0);
+
+  auto x1_t = dets.select(1, 0).contiguous();
+  auto y1_t = dets.select(1, 1).contiguous();
+  auto x2_t = dets.select(1, 2).contiguous();
+  auto y2_t = dets.select(1, 3).contiguous();
+  auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));
+  at::Tensor suppressed_t = at::zeros({ndets}, dets.options().dtype(at::kByte));
+  at::Tensor keep_t = at::zeros({ndets}, dets.options().dtype(at::kLong));
+  at::Tensor areas_t = at::zeros({ndets}, dets.options().dtype(at::kFloat));
+
+  auto suppressed = suppressed_t.data_ptr<uint8_t>();
+  auto keep = keep_t.data_ptr<int64_t>();
+  auto order = order_t.data_ptr<int64_t>();
+  auto x1 = x1_t.data_ptr<scalar_t>();
+  auto y1 = y1_t.data_ptr<scalar_t>();
+  auto x2 = x2_t.data_ptr<scalar_t>();
+  auto y2 = y2_t.data_ptr<scalar_t>();
+  auto areas = areas_t.data_ptr<float>();
+
+  for (int64_t i = 0; i < ndets; i++) {
+    // Note 1: To get the exact area we'd need to multiply by scale**2, but this
+    // would get canceled out in the computation of ovr below. So we leave that
+    // out.
+    // Note 2: degenerate boxes (x2 < x1 or y2 < y1) may underflow, although
+    // integral promotion rules will likely prevent it (see
+    // https://stackoverflow.com/questions/32959564/subtraction-of-two-unsigned-gives-signed
+    // for more details).
+    areas[i] = (x2[i].val_ - x1[i].val_) * (y2[i].val_ - y1[i].val_);
+  }
+
+  int64_t num_to_keep = 0;
+
+  for (int64_t _i = 0; _i < ndets; _i++) {
+    auto i = order[_i];
+    if (suppressed[i] == 1)
+      continue;
+    keep[num_to_keep++] = i;
+
+    // We explicitly cast coordinates to float so that the code can be
+    // vectorized.
+    float ix1val = x1[i].val_;
+    float iy1val = y1[i].val_;
+    float ix2val = x2[i].val_;
+    float iy2val = y2[i].val_;
+    float iarea = areas[i];
+
+    for (int64_t _j = _i + 1; _j < ndets; _j++) {
+      auto j = order[_j];
+      if (suppressed[j] == 1)
+        continue;
+      float xx1 = std::max(ix1val, (float)x1[j].val_);
+      float yy1 = std::max(iy1val, (float)y1[j].val_);
+      float xx2 = std::min(ix2val, (float)x2[j].val_);
+      float yy2 = std::min(iy2val, (float)y2[j].val_);
+
+      auto w = std::max(0.f, xx2 - xx1); // * scale (gets canceled below)
+      auto h = std::max(0.f, yy2 - yy1); // * scale (gets canceled below)
+      auto inter = w * h;
+      auto ovr = inter / (iarea + areas[j] - inter);
+      if (ovr > iou_threshold)
+        suppressed[j] = 1;
+    }
+  }
+  return keep_t.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep);
+}
+
+at::Tensor qnms_kernel(
+    const at::Tensor& dets,
+    const at::Tensor& scores,
+    double iou_threshold) {
+  TORCH_CHECK(
+      dets.dim() == 2, "boxes should be a 2d tensor, got ", dets.dim(), "D");
+  TORCH_CHECK(
+      dets.size(1) == 4,
+      "boxes should have 4 elements in dimension 1, got ",
+      dets.size(1));
+  TORCH_CHECK(
+      scores.dim() == 1,
+      "scores should be a 1d tensor, got ",
+      scores.dim(),
+      "D");
+  TORCH_CHECK(
+      dets.size(0) == scores.size(0),
+      "boxes and scores should have same number of elements in ",
+      "dimension 0, got ",
+      dets.size(0),
+      " and ",
+      scores.size(0));
+
+  auto result = at::empty({0});
+
+  AT_DISPATCH_QINT_TYPES(dets.scalar_type(), "qnms_kernel", [&] {
+    result = qnms_kernel_impl<scalar_t>(dets, scores, iou_threshold);
+  });
+  return result;
+}
+
+} // namespace
+
+TORCH_LIBRARY_IMPL(torchvision, QuantizedCPU, m) {
+  m.impl(TORCH_SELECTIVE_NAME("torchvision::nms"), TORCH_FN(qnms_kernel));
+}
+
+} // namespace ops
+} // namespace vision
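
Since the kernel registers under the existing `torchvision::nms` schema, quantized tensors go through the same public entry point. The reason it can work on raw integer values: dequantization is affine, `x = scale * (q - zero_point)`, so the `zero_point` cancels in every coordinate difference, and the leftover `scale` factor (squared, once for widths and once for heights) multiplies areas and intersection alike and cancels in `ovr = inter / (iarea + areas[j] - inter)`. A minimal usage sketch mirroring `test_qnms` above; the box coordinates, scores, and quantization parameters are illustrative, not taken from the PR:

```python
import torch
from torchvision import ops

# Illustrative float inputs: boxes in (x1, y1, x2, y2) format plus one score each.
boxes = torch.tensor([[0., 0., 10., 10.],
                      [1., 1., 11., 11.],     # overlaps box 0 with IoU ~0.68
                      [20., 20., 30., 30.]])  # disjoint from the others
scores = torch.tensor([90., 80., 70.])

# Quantize both tensors (scale/zero_point chosen arbitrarily for the example).
qboxes = torch.quantize_per_tensor(boxes, scale=1.0, zero_point=0, dtype=torch.quint8)
qscores = torch.quantize_per_tensor(scores, scale=1.0, zero_point=0, dtype=torch.quint8)

# ops.nms dispatches on the input type, so quantized inputs reach the
# QuantizedCPU kernel registered in qnms_kernel.cpp.
qkeep = ops.nms(qboxes, qscores, iou_threshold=0.5)
print(qkeep)  # indices of the surviving boxes: tensor([0, 2])
```

Note that the test compares against `ops.nms` run on the *dequantized* copies of the same tensors rather than on the original floats: quantization rounding can nudge a borderline IoU across the threshold, so both paths must see identical coordinate values for the keep indices to match exactly.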