-
-
Notifications
You must be signed in to change notification settings - Fork 18.7k
ENH: Support mask in unique #48109
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
ENH: Support mask in unique #48109
Changes from 9 commits
6029453
2074bd9
539c1ba
b87ddbe
38bf997
676d5c6
08ad510
e829db7
dce9ef8
59d50af
f6d3137
362f824
101018c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -521,7 +521,7 @@ cdef class {{name}}HashTable(HashTable): | |
def _unique(self, const {{dtype}}_t[:] values, {{name}}Vector uniques, | ||
Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, | ||
object na_value=None, bint ignore_na=False, | ||
object mask=None, bint return_inverse=False): | ||
object mask=None, bint return_inverse=False, bint use_result_mask=False): | ||
""" | ||
Calculate unique values and labels (no sorting!) | ||
|
||
|
@@ -551,6 +551,9 @@ cdef class {{name}}HashTable(HashTable): | |
return_inverse : bool, default False | ||
Whether the mapping of the original array values to their location | ||
in the vector of uniques should be returned. | ||
use_result_mask : bool, default False | ||
Whether to create a result mask for the unique values. Not supported | ||
with return_inverse=True. | ||
|
||
Returns | ||
------- | ||
|
@@ -566,14 +569,24 @@ cdef class {{name}}HashTable(HashTable): | |
{{c_type}} val, na_value2 | ||
khiter_t k | ||
{{name}}VectorData *ud | ||
bint use_na_value, use_mask | ||
UInt8Vector result_mask | ||
UInt8VectorData *rmd | ||
bint use_na_value, use_mask, seen_na = False | ||
uint8_t[:] mask_values | ||
|
||
if return_inverse: | ||
labels = np.empty(n, dtype=np.intp) | ||
ud = uniques.data | ||
use_na_value = na_value is not None | ||
use_mask = mask is not None | ||
if not use_mask and use_result_mask: | ||
raise NotImplementedError # pragma: no cover | ||
|
||
if use_result_mask and return_inverse: | ||
raise NotImplementedError # pragma: no cover | ||
|
||
result_mask = UInt8Vector() | ||
rmd = result_mask.data | ||
|
||
if use_mask: | ||
mask_values = mask.view("uint8") | ||
|
@@ -605,6 +618,27 @@ cdef class {{name}}HashTable(HashTable): | |
# and replace the corresponding label with na_sentinel | ||
labels[i] = na_sentinel | ||
continue | ||
elif not ignore_na and use_result_mask: | ||
if mask_values[i]: | ||
if seen_na: | ||
continue | ||
|
||
seen_na = True | ||
if needs_resize(ud): | ||
with gil: | ||
if uniques.external_view_exists: | ||
raise ValueError("external reference to " | ||
"uniques held, but " | ||
"Vector.resize() needed") | ||
uniques.resize() | ||
if result_mask.external_view_exists: | ||
raise ValueError("external reference to " | ||
"result_mask held, but " | ||
"Vector.resize() needed") | ||
result_mask.resize() | ||
append_data_{{dtype}}(ud, val) | ||
append_data_uint8(rmd, 1) | ||
continue | ||
|
||
k = kh_get_{{dtype}}(self.table, val) | ||
|
||
|
@@ -619,7 +653,16 @@ cdef class {{name}}HashTable(HashTable): | |
"uniques held, but " | ||
"Vector.resize() needed") | ||
uniques.resize() | ||
if use_result_mask: | ||
if result_mask.external_view_exists: | ||
raise ValueError("external reference to " | ||
"result_mask held, but " | ||
"Vector.resize() needed") | ||
result_mask.resize() | ||
append_data_{{dtype}}(ud, val) | ||
if use_result_mask: | ||
append_data_uint8(rmd, 0) | ||
|
||
if return_inverse: | ||
self.table.vals[k] = count | ||
labels[i] = count | ||
|
@@ -632,9 +675,11 @@ cdef class {{name}}HashTable(HashTable): | |
|
||
if return_inverse: | ||
return uniques.to_array(), labels.base # .base -> underlying ndarray | ||
if use_result_mask: | ||
return uniques.to_array(), result_mask.to_array() | ||
return uniques.to_array() | ||
|
||
def unique(self, const {{dtype}}_t[:] values, bint return_inverse=False): | ||
def unique(self, const {{dtype}}_t[:] values, bint return_inverse=False, object mask=None): | ||
""" | ||
Calculate unique values and labels (no sorting!) | ||
|
||
|
@@ -652,10 +697,14 @@ cdef class {{name}}HashTable(HashTable): | |
Unique values of input, not sorted | ||
labels : ndarray[intp_t] (if return_inverse) | ||
The labels from values to uniques | ||
mask : ndarray[bool], optional | ||
If not None, the mask is used as indicator for missing values | ||
(True = missing, False = valid) instead of `na_value` or | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Was this meant to be added to the Parameters section? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes, thanks. Moved the mask part up and added a result_mask return value. |
||
""" | ||
uniques = {{name}}Vector() | ||
use_result_mask = True if mask is not None else False | ||
return self._unique(values, uniques, ignore_na=False, | ||
return_inverse=return_inverse) | ||
return_inverse=return_inverse, mask=mask, use_result_mask=use_result_mask) | ||
|
||
def factorize(self, const {{dtype}}_t[:] values, Py_ssize_t na_sentinel=-1, | ||
object na_value=None, object mask=None): | ||
|
@@ -1013,7 +1062,7 @@ cdef class StringHashTable(HashTable): | |
return uniques.to_array(), labels.base # .base -> underlying ndarray | ||
return uniques.to_array() | ||
|
||
def unique(self, ndarray[object] values, bint return_inverse=False): | ||
def unique(self, ndarray[object] values, bint return_inverse=False, object mask=None): | ||
""" | ||
Calculate unique values and labels (no sorting!) | ||
|
||
|
@@ -1024,6 +1073,8 @@ cdef class StringHashTable(HashTable): | |
return_inverse : bool, default False | ||
Whether the mapping of the original array values to their location | ||
in the vector of uniques should be returned. | ||
mask : ndarray[bool], optional | ||
Not yet implemented for StringHashTable | ||
|
||
Returns | ||
------- | ||
|
@@ -1266,7 +1317,7 @@ cdef class PyObjectHashTable(HashTable): | |
return uniques.to_array(), labels.base # .base -> underlying ndarray | ||
return uniques.to_array() | ||
|
||
def unique(self, ndarray[object] values, bint return_inverse=False): | ||
def unique(self, ndarray[object] values, bint return_inverse=False, object mask=None): | ||
""" | ||
Calculate unique values and labels (no sorting!) | ||
|
||
|
@@ -1277,6 +1328,8 @@ cdef class PyObjectHashTable(HashTable): | |
return_inverse : bool, default False | ||
Whether the mapping of the original array values to their location | ||
in the vector of uniques should be returned. | ||
mask : ndarray[bool], optional | ||
Not yet implemented for PyObjectHashTable | ||
|
||
Returns | ||
------- | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -851,6 +851,17 @@ def copy(self: BaseMaskedArrayT) -> BaseMaskedArrayT: | |
mask = mask.copy() | ||
return type(self)(data, mask, copy=False) | ||
|
||
def unique(self: BaseMaskedArrayT) -> BaseMaskedArrayT: | ||
""" | ||
Compute the BaseMaskedArray of unique values. | ||
|
||
Returns | ||
------- | ||
uniques : BaseMaskedArray | ||
""" | ||
uniques, mask = algos._unique(self._data, self._mask) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Nitpick: can you rename the function to something non-private, e.g. unique_with_mask? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Sure, done. |
||
return type(self)(uniques, mask, copy=False) | ||
|
||
@doc(ExtensionArray.searchsorted) | ||
def searchsorted( | ||
self, | ||
|
Uh oh!
There was an error while loading. Please reload this page.