Skip to content

ENH: Support mask in unique #48109

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 13 commits into from
Aug 18, 2022
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions asv_bench/benchmarks/hash_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,21 @@ def time_unique(self, exponent):
pd.unique(self.a2)


class Unique:
    """Benchmark ``pd.unique`` on Series backed by nullable numeric dtypes.

    The benchmark is parametrized over the dtype names in ``params``. The
    timing methods accept the current parameter under the name declared in
    ``param_names`` but do not use it, because ``setup`` has already built
    the data for that dtype.
    """

    params = ["Int64", "Float64"]
    param_names = ["dtype"]

    def setup(self, dtype):
        """Create the two input Series for the given nullable ``dtype``."""
        # 100_003 values (one of them NA), repeated three times, so every
        # value occurs more than once.
        self.ser = pd.Series(([1, pd.NA, 2] + list(range(100_000))) * 3, dtype=dtype)
        # 300_000 distinct integers followed by a single NA: no duplicates.
        self.ser_unique = pd.Series(list(range(300_000)) + [pd.NA], dtype=dtype)

    def time_unique_with_duplicates(self, dtype):
        """Time ``pd.unique`` on the Series that contains repeated values."""
        pd.unique(self.ser)

    def time_unique(self, dtype):
        """Time ``pd.unique`` on the Series whose values are all distinct."""
        pd.unique(self.ser_unique)


class NumericSeriesIndexing:

params = [
Expand Down
65 changes: 59 additions & 6 deletions pandas/_libs/hashtable_class_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -521,7 +521,7 @@ cdef class {{name}}HashTable(HashTable):
def _unique(self, const {{dtype}}_t[:] values, {{name}}Vector uniques,
Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
object na_value=None, bint ignore_na=False,
object mask=None, bint return_inverse=False):
object mask=None, bint return_inverse=False, bint use_result_mask=False):
"""
Calculate unique values and labels (no sorting!)

Expand Down Expand Up @@ -551,6 +551,9 @@ cdef class {{name}}HashTable(HashTable):
return_inverse : bool, default False
Whether the mapping of the original array values to their location
in the vector of uniques should be returned.
use_result_mask : bool, default False
Whether to create a result mask for the unique values. Not supported
with return_inverse=True.

Returns
-------
Expand All @@ -566,14 +569,24 @@ cdef class {{name}}HashTable(HashTable):
{{c_type}} val, na_value2
khiter_t k
{{name}}VectorData *ud
bint use_na_value, use_mask
UInt8Vector result_mask
UInt8VectorData *rmd
bint use_na_value, use_mask, seen_na = False
uint8_t[:] mask_values

if return_inverse:
labels = np.empty(n, dtype=np.intp)
ud = uniques.data
use_na_value = na_value is not None
use_mask = mask is not None
if not use_mask and use_result_mask:
raise NotImplementedError # pragma: no cover

if use_result_mask and return_inverse:
raise NotImplementedError # pragma: no cover

result_mask = UInt8Vector()
rmd = result_mask.data

if use_mask:
mask_values = mask.view("uint8")
Expand Down Expand Up @@ -605,6 +618,27 @@ cdef class {{name}}HashTable(HashTable):
# and replace the corresponding label with na_sentinel
labels[i] = na_sentinel
continue
elif not ignore_na and use_result_mask:
if mask_values[i]:
if seen_na:
continue

seen_na = True
if needs_resize(ud):
with gil:
if uniques.external_view_exists:
raise ValueError("external reference to "
"uniques held, but "
"Vector.resize() needed")
uniques.resize()
if result_mask.external_view_exists:
raise ValueError("external reference to "
"result_mask held, but "
"Vector.resize() needed")
result_mask.resize()
append_data_{{dtype}}(ud, val)
append_data_uint8(rmd, 1)
continue

k = kh_get_{{dtype}}(self.table, val)

Expand All @@ -619,7 +653,16 @@ cdef class {{name}}HashTable(HashTable):
"uniques held, but "
"Vector.resize() needed")
uniques.resize()
if use_result_mask:
if result_mask.external_view_exists:
raise ValueError("external reference to "
"result_mask held, but "
"Vector.resize() needed")
result_mask.resize()
append_data_{{dtype}}(ud, val)
if use_result_mask:
append_data_uint8(rmd, 0)

if return_inverse:
self.table.vals[k] = count
labels[i] = count
Expand All @@ -632,9 +675,11 @@ cdef class {{name}}HashTable(HashTable):

if return_inverse:
return uniques.to_array(), labels.base # .base -> underlying ndarray
if use_result_mask:
return uniques.to_array(), result_mask.to_array()
return uniques.to_array()

def unique(self, const {{dtype}}_t[:] values, bint return_inverse=False):
def unique(self, const {{dtype}}_t[:] values, bint return_inverse=False, object mask=None):
"""
Calculate unique values and labels (no sorting!)

Expand All @@ -652,10 +697,14 @@ cdef class {{name}}HashTable(HashTable):
Unique values of input, not sorted
labels : ndarray[intp_t] (if return_inverse)
The labels from values to uniques
mask : ndarray[bool], optional
If not None, the mask is used as indicator for missing values
(True = missing, False = valid) instead of `na_value` or
condition "val != val".
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Was this meant to be added to the Parameters section?
(although actually also the return value changes if passing a mask, maybe that can be clarified as well)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes thanks. Moved the mask part up and added a result_mask return value

"""
uniques = {{name}}Vector()
use_result_mask = True if mask is not None else False
return self._unique(values, uniques, ignore_na=False,
return_inverse=return_inverse)
return_inverse=return_inverse, mask=mask, use_result_mask=use_result_mask)

def factorize(self, const {{dtype}}_t[:] values, Py_ssize_t na_sentinel=-1,
object na_value=None, object mask=None):
Expand Down Expand Up @@ -1013,7 +1062,7 @@ cdef class StringHashTable(HashTable):
return uniques.to_array(), labels.base # .base -> underlying ndarray
return uniques.to_array()

def unique(self, ndarray[object] values, bint return_inverse=False):
def unique(self, ndarray[object] values, bint return_inverse=False, object mask=None):
"""
Calculate unique values and labels (no sorting!)

Expand All @@ -1024,6 +1073,8 @@ cdef class StringHashTable(HashTable):
return_inverse : bool, default False
Whether the mapping of the original array values to their location
in the vector of uniques should be returned.
mask : ndarray[bool], optional
Not yet implemented for StringHashTable

Returns
-------
Expand Down Expand Up @@ -1266,7 +1317,7 @@ cdef class PyObjectHashTable(HashTable):
return uniques.to_array(), labels.base # .base -> underlying ndarray
return uniques.to_array()

def unique(self, ndarray[object] values, bint return_inverse=False):
def unique(self, ndarray[object] values, bint return_inverse=False, object mask=None):
"""
Calculate unique values and labels (no sorting!)

Expand All @@ -1277,6 +1328,8 @@ cdef class PyObjectHashTable(HashTable):
return_inverse : bool, default False
Whether the mapping of the original array values to their location
in the vector of uniques should be returned.
mask : ndarray[bool], optional
Not yet implemented for PyObjectHashTable

Returns
-------
Expand Down
19 changes: 16 additions & 3 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -322,6 +322,8 @@ def unique(values):
Parameters
----------
values : 1d array-like
mask : 1d bool array
Mask of the values.

Returns
-------
Expand Down Expand Up @@ -404,6 +406,11 @@ def unique(values):
>>> pd.unique([("a", "b"), ("b", "a"), ("a", "c"), ("b", "a")])
array([('a', 'b'), ('b', 'a'), ('a', 'c')], dtype=object)
"""
return _unique(values)


def _unique(values, mask: npt.NDArray[np.bool_] | None = None):
"""See algorithms.unique for docs"""
values = _ensure_arraylike(values)

if is_extension_array_dtype(values.dtype):
Expand All @@ -414,9 +421,15 @@ def unique(values):
htable, values = _get_hashtable_algo(values)

table = htable(len(values))
uniques = table.unique(values)
uniques = _reconstruct_data(uniques, original.dtype, original)
return uniques
if mask is None:
uniques = table.unique(values)
uniques = _reconstruct_data(uniques, original.dtype, original)
return uniques

else:
uniques, mask = table.unique(values, mask=mask)
uniques = _reconstruct_data(uniques, original.dtype, original)
return uniques, mask.astype("bool")


unique1d = unique
Expand Down
11 changes: 11 additions & 0 deletions pandas/core/arrays/masked.py
Original file line number Diff line number Diff line change
Expand Up @@ -851,6 +851,17 @@ def copy(self: BaseMaskedArrayT) -> BaseMaskedArrayT:
mask = mask.copy()
return type(self)(data, mask, copy=False)

def unique(self: BaseMaskedArrayT) -> BaseMaskedArrayT:
"""
Compute the BaseMaskedArray of unique values.

Returns
-------
uniques : BaseMaskedArray
"""
uniques, mask = algos._unique(self._data, self._mask)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nitpick: can you rename the function to something non-private, e.g. `unique_with_mask`?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure, done

return type(self)(uniques, mask, copy=False)

@doc(ExtensionArray.searchsorted)
def searchsorted(
self,
Expand Down
7 changes: 7 additions & 0 deletions pandas/tests/test_algos.py
Original file line number Diff line number Diff line change
Expand Up @@ -834,6 +834,13 @@ def test_do_not_mangle_na_values(self, unique_nulls_fixture, unique_nulls_fixtur
assert a[0] is unique_nulls_fixture
assert a[1] is unique_nulls_fixture2

def test_unique_masked(self, any_numeric_ea_dtype):
    """Check ``pd.unique`` on a masked Series with repeated values and NAs."""
    # GH#48019
    # NOTE(review): the pull request that adds this test is #48109 — confirm
    # the issue number above is intended and not a digit transposition.
    dtype = any_numeric_ea_dtype
    repeated = Series([1, pd.NA, 2] * 3, dtype=dtype)
    # Expect first-seen order with NA kept exactly once.
    expected = pd.array([1, pd.NA, 2], dtype=dtype)
    tm.assert_extension_array_equal(pd.unique(repeated), expected)


class TestIsin:
def test_invalid(self):
Expand Down