diff --git a/asv_bench/benchmarks/sparse.py b/asv_bench/benchmarks/sparse.py index ff1c4c92fe551..8f424685dfbae 100644 --- a/asv_bench/benchmarks/sparse.py +++ b/asv_bench/benchmarks/sparse.py @@ -195,4 +195,17 @@ def time_take(self, indices, allow_fill): self.sp_arr.take(indices, allow_fill=allow_fill) +class GetItem: + def setup(self): + N = 1_000_000 + arr = make_array(N, 1e-5, np.nan, np.float64) + self.sp_arr = SparseArray(arr) + + def time_integer_indexing(self): + self.sp_arr[78] + + def time_slice(self): + self.sp_arr[1:] + + from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 14881293ceb1f..d65d21878a584 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -357,8 +357,10 @@ Performance improvements - Performance improvement in indexing with a :class:`MultiIndex` indexer on another :class:`MultiIndex` (:issue:`43370`) - Performance improvement in :meth:`GroupBy.quantile` (:issue:`43469`) - :meth:`SparseArray.min` and :meth:`SparseArray.max` no longer require converting to a dense array (:issue:`43526`) +- Indexing into a :class:`SparseArray` with a ``slice`` with ``step=1`` no longer requires converting to a dense array (:issue:`43777`) - Performance improvement in :meth:`SparseArray.take` with ``allow_fill=False`` (:issue:`43654`) - Performance improvement in :meth:`.Rolling.mean` and :meth:`.Expanding.mean` with ``engine="numba"`` (:issue:`43612`) +- .. 
--------------------------------------------------------------------------- diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 57f9c7262bce3..ba7cf7d1bbfee 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -892,13 +892,39 @@ def __getitem__( elif isinstance(key, tuple): data_slice = self.to_dense()[key] elif isinstance(key, slice): - # special case to preserve dtypes - if key == slice(None): - return self.copy() - # TODO: this logic is surely elsewhere - # TODO: this could be more efficient - indices = np.arange(len(self), dtype=np.int32)[key] - return self.take(indices) + + # Avoid densifying when handling contiguous slices + if key.step is None or key.step == 1: + start = 0 if key.start is None else key.start + if start < 0: + start += len(self) + + end = len(self) if key.stop is None else key.stop + if end < 0: + end += len(self) + + indices = self.sp_index.to_int_index().indices + keep_inds = np.flatnonzero((indices >= start) & (indices < end)) + sp_vals = self.sp_values[keep_inds] + + sp_index = indices[keep_inds].copy() + + # If we've sliced to not include the start of the array, all our indices + # should be shifted. NB: here we are careful to also not shift by a + # negative value for a case like [0, 1][-100:] where the start index + # should be treated like 0 + if start > 0: + sp_index -= start + + # Length of our result should match applying this slice to a range + # of the length of our original array + new_len = len(range(len(self))[key]) + new_sp_index = make_sparse_index(new_len, sp_index, self.kind) + return type(self)._simple_new(sp_vals, new_sp_index, self.dtype) + else: + indices = np.arange(len(self), dtype=np.int32)[key] + return self.take(indices) + else: # TODO: I think we can avoid densifying when masking a # boolean SparseArray with another. 
@@ -1745,10 +1771,10 @@ def make_sparse_index(length: int, indices, kind: Literal["integer"]) -> IntInde def make_sparse_index(length: int, indices, kind: SparseIndexKind) -> SparseIndex: index: SparseIndex - if kind == "block" or isinstance(kind, BlockIndex): + if kind == "block": locs, lens = splib.get_blocks(indices) index = BlockIndex(length, locs, lens) - elif kind == "integer" or isinstance(kind, IntIndex): + elif kind == "integer": index = IntIndex(length, indices) else: # pragma: no cover raise ValueError("must be block or integer type") diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py index 34ee68dbbbf18..0b00ff2dbd861 100644 --- a/pandas/tests/arrays/sparse/test_array.py +++ b/pandas/tests/arrays/sparse/test_array.py @@ -679,23 +679,37 @@ def test_getitem_arraylike_mask(self): expected = SparseArray([0, 2]) tm.assert_sp_array_equal(result, expected) - def test_getslice(self): - result = self.arr[:-3] - exp = SparseArray(self.arr.to_dense()[:-3]) - tm.assert_sp_array_equal(result, exp) - - result = self.arr[-4:] - exp = SparseArray(self.arr.to_dense()[-4:]) - tm.assert_sp_array_equal(result, exp) - - # two corner cases from Series - result = self.arr[-12:] - exp = SparseArray(self.arr) - tm.assert_sp_array_equal(result, exp) - - result = self.arr[:-12] - exp = SparseArray(self.arr.to_dense()[:0]) - tm.assert_sp_array_equal(result, exp) + @pytest.mark.parametrize( + "slc", + [ + np.s_[:], + np.s_[1:10], + np.s_[1:100], + np.s_[10:1], + np.s_[:-3], + np.s_[-5:-4], + np.s_[:-12], + np.s_[-12:], + np.s_[2:], + np.s_[2::3], + np.s_[::2], + np.s_[::-1], + np.s_[::-2], + np.s_[1:6:2], + np.s_[:-6:-2], + ], + ) + @pytest.mark.parametrize( + "as_dense", [[np.nan] * 10, [1] * 10, [np.nan] * 5 + [1] * 5, []] + ) + def test_getslice(self, slc, as_dense): + as_dense = np.array(as_dense) + arr = SparseArray(as_dense) + + result = arr[slc] + expected = SparseArray(as_dense[slc]) + + 
tm.assert_sp_array_equal(result, expected) def test_getslice_tuple(self): dense = np.array([np.nan, 0, 3, 4, 0, 5, np.nan, np.nan, 0])