Skip to content

Add quantized version of nms #3601

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 21 commits into from
Mar 30, 2021
Merged
Show file tree
Hide file tree
Changes from 15 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,8 +138,11 @@ def get_extensions():

# Extension sources: top-level .cpp files plus the ops/ tree. The quantized
# CPU kernels live under ops/quantized/cpu and must be compiled in as well.
main_file = glob.glob(os.path.join(extensions_dir, '*.cpp')) + glob.glob(
    os.path.join(extensions_dir, 'ops', '*.cpp'))
source_cpu = (
    glob.glob(os.path.join(extensions_dir, 'ops', 'autograd', '*.cpp')) +
    glob.glob(os.path.join(extensions_dir, 'ops', 'cpu', '*.cpp')) +
    glob.glob(os.path.join(extensions_dir, 'ops', 'quantized', 'cpu', '*.cpp'))
)

is_rocm_pytorch = False
if torch.__version__ >= '1.5':
Expand Down
23 changes: 23 additions & 0 deletions test/test_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -418,6 +418,29 @@ def test_nms(self):
self.assertRaises(RuntimeError, ops.nms, torch.rand(3, 4), torch.rand(3, 2), 0.5)
self.assertRaises(RuntimeError, ops.nms, torch.rand(3, 4), torch.rand(4), 0.5)

def test_qnms(self):
    # We compare qnms against nms rather than against the reference
    # implementation: after the int conversion, the trick used in
    # _create_tensors_with_iou no longer holds (nms vs the reference
    # implementation would also disagree on ints).
    err_msg = 'NMS and QNMS give different results for IoU={}'
    for iou in (0.2, 0.5, 0.8):
        for scale, zero_point in ((1, 0), (2, 50), (3, 10)):
            boxes, scores = self._create_tensors_with_iou(1000, iou)
            # Bump the scores so they don't all collapse to 0 or 1 after
            # the int conversion.
            scores *= 100

            qboxes = torch.quantize_per_tensor(
                boxes, scale=scale, zero_point=zero_point, dtype=torch.quint8)
            qscores = torch.quantize_per_tensor(
                scores, scale=scale, zero_point=zero_point, dtype=torch.quint8)

            # Feed the fp32 path the dequantized tensors so both kernels
            # see exactly the same (rounded) values.
            boxes = qboxes.dequantize()
            scores = qscores.dequantize()

            keep = ops.nms(boxes, scores, iou)
            qkeep = ops.nms(qboxes, qscores, iou)

            self.assertTrue(torch.allclose(qkeep, keep), err_msg.format(iou))

@unittest.skipIf(not torch.cuda.is_available(), "CUDA unavailable")
def test_nms_cuda(self, dtype=torch.float64):
tol = 1e-3 if dtype is torch.half else 1e-5
Expand Down
126 changes: 126 additions & 0 deletions torchvision/csrc/ops/quantized/cpu/qnms_kernel.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
#include <ATen/ATen.h>
#include <ATen/native/quantized/affine_quantizer.h>
#include <torch/library.h>

namespace vision {
namespace ops {

namespace {

template <typename scalar_t>
at::Tensor qnms_kernel_impl(
    const at::Tensor& dets,
    const at::Tensor& scores,
    double iou_threshold) {
  // CPU NMS over quantized boxes. Works directly on the underlying integer
  // representations (val_): since every box shares the same scale/zero_point,
  // the affine transform cancels out in the IoU ratio below.
  TORCH_CHECK(!dets.is_cuda(), "dets must be a CPU tensor");
  TORCH_CHECK(!scores.is_cuda(), "scores must be a CPU tensor");
  TORCH_CHECK(
      dets.scalar_type() == scores.scalar_type(),
      "dets should have the same type as scores");

  if (dets.numel() == 0)
    return at::empty({0}, dets.options().dtype(at::kLong));

  const auto ndets = dets.size(0);

  auto x1_t = dets.select(1, 0).contiguous();
  auto y1_t = dets.select(1, 1).contiguous();
  auto x2_t = dets.select(1, 2).contiguous();
  auto y2_t = dets.select(1, 3).contiguous();
  auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));
  at::Tensor suppressed_t = at::zeros({ndets}, dets.options().dtype(at::kByte));
  at::Tensor keep_t = at::zeros({ndets}, dets.options().dtype(at::kLong));
  at::Tensor areas_t = at::zeros({ndets}, dets.options().dtype(at::kFloat));

  auto suppressed = suppressed_t.data_ptr<uint8_t>();
  auto keep = keep_t.data_ptr<int64_t>();
  auto order = order_t.data_ptr<int64_t>();
  auto x1 = x1_t.data_ptr<scalar_t>();
  auto y1 = y1_t.data_ptr<scalar_t>();
  auto x2 = x2_t.data_ptr<scalar_t>();
  auto y2 = y2_t.data_ptr<scalar_t>();
  auto areas = areas_t.data_ptr<float>();

  for (int64_t i = 0; i < ndets; i++) {
    // Note 1: To get the exact area we'd need to multiply by scale**2, but
    // this would get canceled out in the computation of ovr below. So we
    // leave that out.
    // Note 2: we compute in float so degenerate boxes (x2 < x1 or y2 < y1)
    // yield a negative area instead of an unsigned underflow.
    areas[i] =
        (float(x2[i].val_) - x1[i].val_) * (float(y2[i].val_) - y1[i].val_);
  }

  int64_t num_to_keep = 0;

  // Greedy NMS: walk boxes in decreasing score order; each kept box
  // suppresses all later boxes whose IoU with it exceeds the threshold.
  for (int64_t _i = 0; _i < ndets; _i++) {
    auto i = order[_i];
    if (suppressed[i] == 1)
      continue;
    keep[num_to_keep++] = i;

    // Cast coordinates to float up front. This avoids any reliance on
    // integral promotion to keep `xx2 - xx1` from underflowing when the
    // underlying type is unsigned (quint8) and the boxes are disjoint,
    // and it lets the compiler vectorize the inner loop.
    float ix1val = x1[i].val_;
    float iy1val = y1[i].val_;
    float ix2val = x2[i].val_;
    float iy2val = y2[i].val_;
    float iarea = areas[i];

    for (int64_t _j = _i + 1; _j < ndets; _j++) {
      auto j = order[_j];
      if (suppressed[j] == 1)
        continue;
      // Intersection rectangle of boxes i and j (in quantized units).
      float xx1 = std::max(ix1val, (float)x1[j].val_);
      float yy1 = std::max(iy1val, (float)y1[j].val_);
      float xx2 = std::min(ix2val, (float)x2[j].val_);
      float yy2 = std::min(iy2val, (float)y2[j].val_);

      auto w = std::max(0.f, xx2 - xx1); // * scale (gets canceled below)
      auto h = std::max(0.f, yy2 - yy1); // * scale (gets canceled below)
      auto inter = w * h;
      auto ovr = inter / (iarea + areas[j] - inter);
      if (ovr > iou_threshold)
        suppressed[j] = 1;
    }
  }
  return keep_t.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep);
}

at::Tensor qnms_kernel(
    const at::Tensor& dets,
    const at::Tensor& scores,
    double iou_threshold) {
  // Validate shapes: dets must be an [N, 4] box tensor and scores a
  // matching [N] vector.
  TORCH_CHECK(
      dets.dim() == 2, "boxes should be a 2d tensor, got ", dets.dim(), "D");
  TORCH_CHECK(
      dets.size(1) == 4,
      "boxes should have 4 elements in dimension 1, got ",
      dets.size(1));
  TORCH_CHECK(
      scores.dim() == 1,
      "scores should be a 1d tensor, got ",
      scores.dim(),
      "D");
  TORCH_CHECK(
      dets.size(0) == scores.size(0),
      "boxes and scores should have same number of elements in ",
      "dimension 0, got ",
      dets.size(0),
      " and ",
      scores.size(0));

  // Dispatch on the quantized dtype; the lambda writes the kept indices
  // into `keep`, which it captures by reference.
  at::Tensor keep = at::empty({0});
  AT_DISPATCH_QINT_TYPES(dets.scalar_type(), "qnms_kernel", [&] {
    keep = qnms_kernel_impl<scalar_t>(dets, scores, iou_threshold);
  });
  return keep;
}

} // namespace

// Register qnms_kernel as the QuantizedCPU backend implementation of the
// torchvision::nms operator, so ops.nms transparently dispatches here when
// called with quantized CPU tensors.
TORCH_LIBRARY_IMPL(torchvision, QuantizedCPU, m) {
  m.impl(TORCH_SELECTIVE_NAME("torchvision::nms"), TORCH_FN(qnms_kernel));
}

} // namespace ops
} // namespace vision