Skip to content

DOC/TST: Indexing with NA raises #30308

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 32 commits into from
Jan 3, 2020
Merged
Show file tree
Hide file tree
Changes from 18 commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
492f904
DOC/TST: Indexing with NA raises
TomAugspurger Dec 16, 2019
6444aa0
Merge remote-tracking branch 'upstream/master' into na-indexing-raises
TomAugspurger Dec 18, 2019
53f4f63
Handle BooleanArray in all EAs
TomAugspurger Dec 18, 2019
3bbf868
update
TomAugspurger Dec 18, 2019
a5ac457
fixups
TomAugspurger Dec 18, 2019
0dfe761
type
TomAugspurger Dec 18, 2019
dac111d
fix benchmark
TomAugspurger Dec 18, 2019
d1f08d9
fixup
TomAugspurger Dec 18, 2019
3dd59ca
typo
TomAugspurger Dec 18, 2019
151bdfe
updates
TomAugspurger Dec 19, 2019
d57b0ac
Revert "updates"
TomAugspurger Dec 19, 2019
36be0f6
examples
TomAugspurger Dec 20, 2019
7bd6c2f
restore datetime fix
TomAugspurger Dec 20, 2019
c5f3afb
Merge remote-tracking branch 'upstream/master' into na-indexing-raises
TomAugspurger Dec 20, 2019
76bb6ce
Merge branch 'master' of https://github.com/pandas-dev/pandas into na…
TomAugspurger Dec 28, 2019
505112e
update error message
TomAugspurger Dec 28, 2019
c73ae8e
checks
TomAugspurger Dec 28, 2019
3efe359
Merge remote-tracking branch 'upstream/master' into na-indexing-raises
TomAugspurger Dec 30, 2019
f94483f
update for error message
TomAugspurger Dec 30, 2019
953938d
Merge remote-tracking branch 'upstream/master' into na-indexing-raises
TomAugspurger Dec 30, 2019
8b1e567
update isort
TomAugspurger Dec 30, 2019
f317c64
isort
TomAugspurger Dec 30, 2019
c656292
fixup
TomAugspurger Dec 30, 2019
d4f0adc
Merge branch 'master' of https://github.com/pandas-dev/pandas into na…
TomAugspurger Dec 31, 2019
37ea95e
fixup
TomAugspurger Dec 31, 2019
816a47c
Merge remote-tracking branch 'upstream/master' into na-indexing-raises
TomAugspurger Jan 2, 2020
21fd589
update arrayo
TomAugspurger Jan 2, 2020
3637070
doc
TomAugspurger Jan 2, 2020
61599f2
integer
TomAugspurger Jan 2, 2020
6a0eda6
Merge remote-tracking branch 'upstream/master' into na-indexing-raises
TomAugspurger Jan 2, 2020
e622826
fixup
TomAugspurger Jan 2, 2020
5004d91
fixup
TomAugspurger Jan 2, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ repos:
language: python_venv
additional_dependencies: [flake8-comprehensions>=3.1.0]
- repo: https://github.com/pre-commit/mirrors-isort
rev: v4.3.20
rev: v4.3.21
hooks:
- id: isort
language: python_venv
Expand Down
11 changes: 8 additions & 3 deletions doc/source/reference/extensions.rst
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,6 @@ objects.
api.extensions.register_series_accessor
api.extensions.register_index_accessor
api.extensions.ExtensionDtype
api.extensions.is_bool_indexer
api.extensions.check_bool_array_indexer

.. autosummary::
:toctree: api/
Expand All @@ -28,7 +26,6 @@ objects.
api.extensions.ExtensionArray
arrays.PandasArray


.. We need this autosummary so that methods and attributes are generated.
.. Separate block, since they aren't classes.

Expand Down Expand Up @@ -62,3 +59,11 @@ objects.
api.extensions.ExtensionArray.nbytes
api.extensions.ExtensionArray.ndim
api.extensions.ExtensionArray.shape

Additionally, we have some utility functions for ensuring your object
behaves correctly.

.. autosummary::
:toctree: api/

api.indexers.check_bool_array_indexer
2 changes: 0 additions & 2 deletions pandas/api/extensions/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,5 +11,3 @@
)
from pandas.core.algorithms import take # noqa: F401
from pandas.core.arrays import ExtensionArray, ExtensionScalarOpsMixin # noqa: F401
from pandas.core.common import is_bool_indexer # noqa: F401
from pandas.core.indexing import check_bool_array_indexer # noqa: F401
1 change: 1 addition & 0 deletions pandas/api/indexers/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
"""Public API for Rolling Window Indexers"""
from pandas.core.indexers import check_bool_array_indexer # noqa: F401
from pandas.core.window.indexers import BaseIndexer # noqa: F401
8 changes: 3 additions & 5 deletions pandas/core/arrays/boolean.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,8 @@
from pandas.core import nanops, ops
from pandas.core.algorithms import take
from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin
from pandas.core.common import is_bool_indexer
import pandas.core.common as com
from pandas.core.indexers import check_bool_array_indexer

if TYPE_CHECKING:
from pandas._typing import Scalar
Expand Down Expand Up @@ -316,15 +317,12 @@ def _hasna(self) -> bool:
return self._mask.any()

def __getitem__(self, item):
# import here to avoid circular import. Probably need to restructure
from pandas.core.indexing import check_bool_array_indexer

if is_integer(item):
if self._mask[item]:
return self.dtype.na_value
return self._data[item]

elif is_bool_indexer(item):
elif com.is_bool_indexer(item):
item = check_bool_array_indexer(self, item)

return type(self)(self._data[item], self._mask[item])
Expand Down
3 changes: 1 addition & 2 deletions pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@
from pandas.core.base import NoNewAttributesMixin, PandasObject, _shared_docs
import pandas.core.common as com
from pandas.core.construction import array, extract_array, sanitize_array
from pandas.core.indexers import check_bool_array_indexer
from pandas.core.missing import interpolate_2d
from pandas.core.ops.common import unpack_zerodim_and_defer
from pandas.core.sorting import nargsort
Expand Down Expand Up @@ -1990,8 +1991,6 @@ def __getitem__(self, key):
"""
Return an item.
"""
from pandas.core.indexing import check_bool_array_indexer

if isinstance(key, (int, np.integer)):
i = self._codes[key]
if i == -1:
Expand Down
3 changes: 1 addition & 2 deletions pandas/core/arrays/datetimelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
from pandas.core import missing, nanops
from pandas.core.algorithms import checked_add_with_arr, take, unique1d, value_counts
import pandas.core.common as com
from pandas.core.indexers import check_bool_array_indexer
from pandas.core.ops.common import unpack_zerodim_and_defer
from pandas.core.ops.invalid import make_invalid_op

Expand Down Expand Up @@ -436,8 +437,6 @@ def __getitem__(self, key):
return type(self)(val, dtype=self.dtype)

if com.is_bool_indexer(key):
from pandas.core.indexing import check_bool_array_indexer

key = check_bool_array_indexer(self, key)
if key.all():
key = slice(0, None, None)
Expand Down
8 changes: 3 additions & 5 deletions pandas/core/arrays/integer.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@
from pandas.core import nanops, ops
from pandas.core.algorithms import take
from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin
from pandas.core.common import is_bool_indexer
import pandas.core.common as com
from pandas.core.indexers import check_bool_array_indexer
from pandas.core.ops import invalid_comparison
from pandas.core.ops.common import unpack_zerodim_and_defer
from pandas.core.tools.numeric import to_numeric
Expand Down Expand Up @@ -365,15 +366,12 @@ def _from_factorized(cls, values, original):
return integer_array(values, dtype=original.dtype)

def __getitem__(self, item):
# Importing this at the top-level causes many unrelated(?) mypy failures
from pandas.core.indexing import check_bool_array_indexer

if is_integer(item):
if self._mask[item]:
return self.dtype.na_value
return self._data[item]

elif is_bool_indexer(item):
elif com.is_bool_indexer(item):
item = check_bool_array_indexer(self, item)

return type(self)(self._data[item], self._mask[item])
Expand Down
8 changes: 3 additions & 5 deletions pandas/core/arrays/numpy_.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,9 @@
from pandas import compat
from pandas.core import nanops
from pandas.core.algorithms import searchsorted, take, unique
from pandas.core.common import is_bool_indexer
import pandas.core.common as com
from pandas.core.construction import extract_array
from pandas.core.indexers import check_bool_array_indexer
from pandas.core.missing import backfill_1d, pad_1d

from .base import ExtensionArray, ExtensionOpsMixin
Expand Down Expand Up @@ -232,13 +233,10 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
# Pandas ExtensionArray Interface

def __getitem__(self, item):
# Avoid mypy failures when importing at the top-level
from pandas.core.indexing import check_bool_array_indexer

if isinstance(item, type(self)):
item = item._ndarray

elif is_bool_indexer(item):
elif com.is_bool_indexer(item):
item = check_bool_array_indexer(self, item)

result = self._ndarray[item]
Expand Down
6 changes: 3 additions & 3 deletions pandas/core/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,10 +121,10 @@ def is_bool_indexer(key: Any) -> bool:

See Also
--------
api.extensions.check_bool_array_indexer : Check that `key`
is a valid mask for an array, and convert to an ndarary.
check_bool_array_indexer : Check that `key`
is a valid mask for an array, and convert to an ndarray.
"""
na_msg = "cannot index with vector containing NA / NaN values"
na_msg = "cannot mask with array containing NA / NaN values"
if isinstance(key, (ABCSeries, np.ndarray, ABCIndex)) or (
is_array_like(key) and is_extension_array_dtype(key.dtype)
):
Expand Down
67 changes: 67 additions & 0 deletions pandas/core/indexers.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
"""
import numpy as np

from pandas._typing import AnyArrayLike

from pandas.core.dtypes.common import is_list_like
from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries

Expand Down Expand Up @@ -240,3 +242,68 @@ def length_of_indexer(indexer, target=None) -> int:
elif not is_list_like_indexer(indexer):
return 1
raise AssertionError("cannot find the length of the indexer")


def check_bool_array_indexer(array: AnyArrayLike, mask: AnyArrayLike) -> np.ndarray:
    """
    Check if `mask` is a valid boolean indexer for `array`.

    `array` and `mask` are checked to have the same length, and the
    dtype is validated.

    .. versionadded:: 1.0.0

    Parameters
    ----------
    array : array
        The array that's being masked.
    mask : array
        The boolean array that's masking.

    Returns
    -------
    numpy.ndarray
        The validated boolean mask.

    Raises
    ------
    IndexError
        When the lengths don't match.
    ValueError
        When `mask` cannot be converted to a bool-dtype ndarray.

    See Also
    --------
    api.types.is_bool_dtype : Check if `key` is of boolean dtype.

    Examples
    --------
    A boolean ndarray is returned when the arguments are all valid.

    >>> mask = pd.array([True, False])
    >>> arr = pd.Series([1, 2])
    >>> pd.api.indexers.check_bool_array_indexer(arr, mask)
    array([ True, False])

    An IndexError is raised when the lengths don't match.

    >>> mask = pd.array([True, False, True])
    >>> pd.api.indexers.check_bool_array_indexer(arr, mask)
    Traceback (most recent call last):
    ...
    IndexError: Item wrong length 3 instead of 2.

    A ValueError is raised when the mask cannot be converted to
    a bool-dtype ndarray.

    >>> mask = pd.array([True, pd.NA])
    >>> pd.api.indexers.check_bool_array_indexer(arr, mask)
    Traceback (most recent call last):
    ...
    ValueError: cannot convert to bool numpy array in presence of missing values
    """
    # NOTE(review): a mask holding missing values fails in this conversion
    # with the generic "cannot convert to bool numpy array" ValueError shown
    # in the docstring. A reviewer suggested an indexing-specific message
    # (e.g. "Cannot do boolean indexing with missing values, use
    # fillna(True/False) ...") -- still an open question, behaviour unchanged.
    result = np.asarray(mask, dtype=bool)
    # GH26658
    if len(result) != len(array):
        raise IndexError(f"Item wrong length {len(result)} instead of {len(array)}.")
    return result
70 changes: 5 additions & 65 deletions pandas/core/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,12 @@
from pandas.core.dtypes.generic import ABCDataFrame, ABCMultiIndex, ABCSeries
from pandas.core.dtypes.missing import _infer_fill_value, isna

from pandas._typing import AnyArrayLike
import pandas.core.common as com
from pandas.core.indexers import is_list_like_indexer, length_of_indexer
from pandas.core.indexers import (
check_bool_array_indexer,
is_list_like_indexer,
length_of_indexer,
)
from pandas.core.indexes.api import Index, InvalidIndexError


Expand Down Expand Up @@ -2270,69 +2273,6 @@ def convert_to_index_sliceable(obj, key):
return None


def check_bool_array_indexer(array: AnyArrayLike, mask: AnyArrayLike) -> np.ndarray:
"""
Check if `mask` is a valid boolean indexer for `array`.

`array` and `mask` are checked to have the same length, and the
dtype is validated.

Parameters
----------
array : array
The array that's being masked.
mask : array
The boolean array that's masking.

Returns
-------
numpy.ndarray
The validated boolean mask.

Raises
------
IndexError
When the lengths don't match.
ValueError
When `mask` cannot be converted to a bool-dtype ndarray.

See Also
--------
api.extensions.is_bool_indexer : Check if `key` is a boolean indexer.

Examples
--------
A boolean ndarray is returned when the arguments are all valid.

>>> mask = pd.array([True, False])
>>> arr = pd.Series([1, 2])
>>> pd.api.extensions.check_bool_array_indexer(arr, mask)
array([ True, False])

An IndexError is raised when the lengths don't match.

>>> mask = pd.array([True, False, True])
>>> pd.api.extensions.check_bool_array_indexer(arr, mask)
Traceback (most recent call last):
...
IndexError: Item wrong length 3 instead of 2.

A ValueError is raised when the mask cannot be converted to
a bool-dtype ndarray.

>>> mask = pd.array([True, pd.NA])
>>> pd.api.extensions.check_bool_array_indexer(arr, mask)
Traceback (most recent call last):
...
ValueError: cannot convert to bool numpy array in presence of missing values
"""
result = np.asarray(mask, dtype=bool)
# GH26658
if len(result) != len(array):
raise IndexError(f"Item wrong length {len(result)} instead of {len(array)}.")
return result


def check_bool_indexer(index: Index, key) -> np.ndarray:
"""
Check if key is a valid boolean indexer for an object with such index and
Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/extension/base/getitem.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ def test_getitem_mask_raises(self, data):
with pytest.raises(IndexError):
data[mask]

def test_getitem_boolenarray_mask(self, data):
def test_getitem_boolean_array_mask(self, data):
mask = pd.array(np.zeros(data.shape, dtype="bool"), dtype="boolean")
result = data[mask]
assert len(result) == 0
Expand All @@ -149,7 +149,7 @@ def test_getitem_boolenarray_mask(self, data):
result = pd.Series(data)[mask]
self.assert_series_equal(result, expected)

def test_getitem_boolenarray_mask_raises(self, data):
def test_getitem_boolean_array_mask_raises(self, data):
mask = pd.array(np.zeros(data.shape, dtype="bool"), dtype="boolean")
mask[:2] = pd.NA
with pytest.raises(ValueError):
Expand Down
17 changes: 10 additions & 7 deletions pandas/tests/extension/decimal/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,7 @@
from pandas.core.dtypes.base import ExtensionDtype

import pandas as pd
from pandas.api.extensions import (
check_bool_array_indexer,
is_bool_indexer,
register_extension_dtype,
)
from pandas.api.extensions import register_extension_dtype
from pandas.core.arrays import ExtensionArray, ExtensionScalarOpsMixin


Expand Down Expand Up @@ -113,8 +109,15 @@ def __getitem__(self, item):
if isinstance(item, numbers.Integral):
return self._data[item]
else:
if is_bool_indexer(item):
item = check_bool_array_indexer(self, item)
# array, slice.
if pd.api.types.is_list_like(item):
if not pd.api.types.is_array_like(item):
item = pd.array(item)
dtype = item.dtype
if pd.api.types.is_bool_dtype(dtype):
item = pd.api.indexers.check_bool_array_indexer(self, item)
elif pd.api.types.is_integer_dtype(dtype):
item = np.asarray(item, dtype="int")
return type(self)(self._data[item])

def take(self, indexer, allow_fill=False, fill_value=None):
Expand Down
Loading