-
Notifications
You must be signed in to change notification settings - Fork 7.1k
Add quantized version of nms #3601
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from 15 commits
Commits
Show all changes
21 commits
Select commit
Hold shift + click to select a range
1f2ecd8
Add quantized version of nms
NicolasHug b63bac6
Added tests
NicolasHug 41a4927
Compute areas only once
NicolasHug cdb44fc
remove calls to dequantize_val
NicolasHug fb451cc
fix return type for empty tensor
NicolasHug d92171c
Merge branch 'master' of github.com:pytorch/vision into qnms
NicolasHug 2eef613
flake8
NicolasHug 51943d8
Merge branch 'master' of github.com:pytorch/vision into qnms
NicolasHug f8a56d7
remove use of scale as it gets cancelled out
NicolasHug 4c75c56
simpler int convertion in tests
NicolasHug 2ac5313
explicitly set ovr to double
NicolasHug f47d238
add tests for more values of scale and zero_point
NicolasHug 4b31259
comment about underflow
NicolasHug 7c18714
remove unnecessary accessor
NicolasHug 31d76ce
properly convert to float for division
NicolasHug 618bbe1
Add comments about underflow
NicolasHug 7e27337
explicitely cast coordinates to float to allow vectorization
NicolasHug 5450689
Merge branch 'master' of github.com:pytorch/vision into qnms
NicolasHug 4cda6e5
clang
NicolasHug dd46376
clang again
NicolasHug aef04fd
hopefully OK now
NicolasHug File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,126 @@ | ||
#include <ATen/ATen.h> | ||
#include <ATen/native/quantized/affine_quantizer.h> | ||
#include <torch/library.h> | ||
|
||
namespace vision { | ||
namespace ops { | ||
|
||
namespace { | ||
|
||
template <typename scalar_t> | ||
at::Tensor qnms_kernel_impl( | ||
const at::Tensor& dets, | ||
const at::Tensor& scores, | ||
double iou_threshold) { | ||
|
||
TORCH_CHECK(!dets.is_cuda(), "dets must be a CPU tensor"); | ||
TORCH_CHECK(!scores.is_cuda(), "scores must be a CPU tensor"); | ||
TORCH_CHECK( | ||
dets.scalar_type() == scores.scalar_type(), | ||
"dets should have the same type as scores"); | ||
|
||
if (dets.numel() == 0) | ||
return at::empty({0}, dets.options().dtype(at::kLong)); | ||
|
||
const auto ndets = dets.size(0); | ||
|
||
auto x1_t = dets.select(1, 0).contiguous(); | ||
auto y1_t = dets.select(1, 1).contiguous(); | ||
auto x2_t = dets.select(1, 2).contiguous(); | ||
auto y2_t = dets.select(1, 3).contiguous(); | ||
auto order_t = std::get<1>(scores.sort(0, /* descending=*/true)); | ||
at::Tensor suppressed_t = at::zeros({ndets}, dets.options().dtype(at::kByte)); | ||
at::Tensor keep_t = at::zeros({ndets}, dets.options().dtype(at::kLong)); | ||
at::Tensor areas_t = at::zeros({ndets}, dets.options().dtype(at::kFloat)); | ||
|
||
auto suppressed = suppressed_t.data_ptr<uint8_t>(); | ||
auto keep = keep_t.data_ptr<int64_t>(); | ||
auto order = order_t.data_ptr<int64_t>(); | ||
auto x1 = x1_t.data_ptr<scalar_t>(); | ||
auto y1 = y1_t.data_ptr<scalar_t>(); | ||
auto x2 = x2_t.data_ptr<scalar_t>(); | ||
auto y2 = y2_t.data_ptr<scalar_t>(); | ||
auto areas = areas_t.data_ptr<float>(); | ||
|
||
for (int64_t i = 0; i < ndets; i++) { | ||
// Note 1: To get the exact area we'd need to multiply by scale**2, but this | ||
// would get canceled out in the computation of ovr below. So we leave that | ||
// out. | ||
// Note 2: degenerate boxes (x2 < x1 or y2 < y1) may underflow. Same below | ||
// when computing w and h | ||
areas[i] = (x2[i].val_ - x1[i].val_) * (y2[i].val_ - y1[i].val_); | ||
} | ||
|
||
int64_t num_to_keep = 0; | ||
|
||
for (int64_t _i = 0; _i < ndets; _i++) { | ||
auto i = order[_i]; | ||
if (suppressed[i] == 1) | ||
continue; | ||
keep[num_to_keep++] = i; | ||
|
||
auto ix1val = x1[i].val_; | ||
auto iy1val = y1[i].val_; | ||
auto ix2val = x2[i].val_; | ||
auto iy2val = y2[i].val_; | ||
auto iarea = areas[i]; | ||
|
||
for (int64_t _j = _i + 1; _j < ndets; _j++) { | ||
auto j = order[_j]; | ||
if (suppressed[j] == 1) | ||
continue; | ||
auto xx1 = std::max(ix1val, x1[j].val_); | ||
auto yy1 = std::max(iy1val, y1[j].val_); | ||
auto xx2 = std::min(ix2val, x2[j].val_); | ||
auto yy2 = std::min(iy2val, y2[j].val_); | ||
|
||
auto w = std::max(0, xx2 - xx1); // * scale (gets canceled below) | ||
auto h = std::max(0, yy2 - yy1); // * scale (gets canceled below) | ||
auto inter = w * h; | ||
auto ovr = (float)inter / (iarea + areas[j] - inter); | ||
if (ovr > iou_threshold) | ||
suppressed[j] = 1; | ||
} | ||
} | ||
return keep_t.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep); | ||
} | ||
|
||
// Entry point for the quantized NMS operator: validates the shapes of
// `dets` ([N, 4] boxes) and `scores` ([N]), then dispatches to the
// templated kernel for the tensor's concrete quantized dtype.
at::Tensor qnms_kernel(
    const at::Tensor& dets,
    const at::Tensor& scores,
    double iou_threshold) {
  TORCH_CHECK(
      dets.dim() == 2, "boxes should be a 2d tensor, got ", dets.dim(), "D");
  TORCH_CHECK(
      dets.size(1) == 4,
      "boxes should have 4 elements in dimension 1, got ",
      dets.size(1));
  TORCH_CHECK(
      scores.dim() == 1,
      "scores should be a 1d tensor, got ",
      scores.dim(),
      "D");
  TORCH_CHECK(
      dets.size(0) == scores.size(0),
      "boxes and scores should have same number of elements in ",
      "dimension 0, got ",
      dets.size(0),
      " and ",
      scores.size(0));

  // Placeholder; overwritten by the dispatch lambda below.
  auto result = at::empty({0});

  // AT_DISPATCH_QINT_TYPES instantiates the body for each supported
  // quantized scalar type, binding `scalar_t` to the matching C++ type.
  AT_DISPATCH_QINT_TYPES(dets.scalar_type(), "qnms_kernel", [&] {
    result = qnms_kernel_impl<scalar_t>(dets, scores, iou_threshold);
  });
  return result;
}
|
||
} // namespace | ||
|
||
// Register qnms_kernel as the QuantizedCPU backend implementation of the
// existing torchvision::nms operator.
TORCH_LIBRARY_IMPL(torchvision, QuantizedCPU, m) {
  m.impl(TORCH_SELECTIVE_NAME("torchvision::nms"), TORCH_FN(qnms_kernel));
}
|
||
} // namespace ops | ||
} // namespace vision |
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
In fact I think it's possible to get an underflow on unsigned types here, even when boxes aren't degenerate: if box j is strictly to the left of box i and they don't overlap, then we'll have `xx2 < xx1`.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
yes, good catch, this happens more often than just with degenerate boxes here
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Trying to come up with a reproducible example of an underflow was quite informative, and I came to the conclusion that:

The reason for 1) is that the result of `a - b` is an `int`, even if both `a` and `b` are `uint8_t`: TIL about integral promotion (https://stackoverflow.com/questions/21421998/subtraction-operation-on-unsigned-char). Since `quint8` is the only supported quantized unsigned type, such values can always be represented as `int`s, and integral promotion will always happen.

Regarding 2): it's possible to get an underflow by doing some explicit casts to `uint8_t` in the `qnms` code. The following boxes and scores should produce `10 5 0 5` for `xx1, xx2, yy1, yy2` and thus lead to an underflow when computing `w = xx2 - xx1`.

However, the underflow leads to `w = 251` and `inter = some_big_value = 1255`, and thus `ovr ~= -2` (note the negative sign, because `some_big_value` gets subtracted in the denominator).

Hence the `if (ovr > iou_threshold)` condition is never satisfied when there's an underflow, and that's actually correct, since the correct value for `ovr` should have been `0` anyway: underflows can only happen when the boxes are disjoint along at least one of the dimensions.

So the results are unchanged, and it's thus impossible to actually test against underflows... I think.

EDIT: we now explicitly cast the coordinates to float32 to allow vectorization, so there's definitely no risk of underflow anymore when computing `w` or `h`.