From ce5a5c432679b6d6268a2982aa11c668e27bf01b Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Tue, 21 Sep 2021 14:04:54 -0700 Subject: [PATCH 1/6] wip --- pandas/core/arrays/sparse/array.py | 26 +++++++++++++++++--------- pandas/core/internals/blocks.py | 3 +++ 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 8e92114d7b3de..06f46e3c62bf1 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -892,13 +892,21 @@ def __getitem__( elif isinstance(key, tuple): data_slice = self.to_dense()[key] elif isinstance(key, slice): - # special case to preserve dtypes - if key == slice(None): - return self.copy() - # TODO: this logic is surely elsewhere - # TODO: this could be more efficient - indices = np.arange(len(self), dtype=np.int32)[key] - return self.take(indices) + start = 0 if key.start is None else key.start + if start < 0: + start += len(self) + + end = len(self) if key.step is None else key.stop + if end < 0: + end += len(self) + + indices = self.sp_index.to_int_index().indices + + keep_inds = np.flatnonzero((indices >= start) & (indices <= end)) + sp_vals = self.sp_values[keep_inds] + sp_index = indices[keep_inds] + new_sp_index = make_sparse_index(len(self), sp_index, self.kind) + return type(self)._simple_new(sp_vals, new_sp_index, self.dtype) else: # TODO: I think we can avoid densifying when masking a # boolean SparseArray with another. Need to look at the @@ -1762,10 +1770,10 @@ def make_sparse_index(length: int, indices, kind: Literal["integer"]) -> IntInde def make_sparse_index(length: int, indices, kind: SparseIndexKind) -> SparseIndex: index: SparseIndex - if kind == "block" or isinstance(kind, BlockIndex): + if kind == "block": locs, lens = splib.get_blocks(indices) index = BlockIndex(length, locs, lens) - elif kind == "integer" or isinstance(kind, IntIndex): + elif kind == "integer": index = IntIndex(length, indices) else: # pragma: no cover raise ValueError("must be block or integer type") diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index da7ffbf08c34b..32d32a200efff 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -2082,6 +2082,9 @@ def to_native_types( """convert to our native types format""" values = ensure_wrapped_if_datetimelike(values) + if is_sparse(values): + values = values.to_dense() + if isinstance(values, (DatetimeArray, TimedeltaArray)): result = values._format_native_types(na_rep=na_rep, **kwargs) result = result.astype(object, copy=False) From 9a641afe5b769cbbd274a865828ff007d450ca72 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Mon, 27 Sep 2021 20:17:15 -0700 Subject: [PATCH 2/6] WIP --- doc/source/whatsnew/v1.4.0.rst | 1 + pandas/core/arrays/sparse/array.py | 44 +++++++++++++++------- pandas/tests/arrays/sparse/test_array.py | 48 +++++++++++++++--------- 3 files changed, 63 insertions(+), 30 deletions(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 7b10a0f39bdbd..6dcca6eaeacf9 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -354,6 +354,7 @@ Performance improvements - Performance improvement in indexing with a :class:`MultiIndex` indexer on another :class:`MultiIndex` (:issue:43370`) - Performance improvement in :meth:`GroupBy.quantile` (:issue:`43469`) - :meth:`SparseArray.min` and :meth:`SparseArray.max` no longer require converting to a dense array (:issue:`43526`) +- Indexing into a :class:`SparseArray` with a ``slice`` with ``step=1`` no longer requires converting to a dense array (:issue:`?`) - .. --------------------------------------------------------------------------- diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 06f46e3c62bf1..9ff279d3ca99c 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -892,21 +892,39 @@ def __getitem__( elif isinstance(key, tuple): data_slice = self.to_dense()[key] elif isinstance(key, slice): - start = 0 if key.start is None else key.start - if start < 0: - start += len(self) - end = len(self) if key.step is None else key.stop - if end < 0: - end += len(self) - - indices = self.sp_index.to_int_index().indices + # Avoid densifying when handling contiguous slices + if key.step is None or key.step == 1: + start = 0 if key.start is None else key.start + if start < 0: + start += len(self) + + end = len(self) if key.stop is None else key.stop + if end < 0: + end += len(self) + + indices = self.sp_index.to_int_index().indices + keep_inds = np.flatnonzero((indices >= start) & (indices < end)) + sp_vals = self.sp_values[keep_inds] + + sp_index = indices[keep_inds].copy() + + # If we've sliced to not include the start of the array, all our indices + # should be shifted. NB: here we are careful to also not shift by a + # negative value for a case like [0, 1][-100:] where the start index + # should be treated like 0 + if start > 0: + sp_index -= start + + # Length of our result should match applying this slice to a range + # of the length of our original array + new_len = len(range(len(self))[key]) + new_sp_index = make_sparse_index(new_len, sp_index, self.kind) + return type(self)._simple_new(sp_vals, new_sp_index, self.dtype) + else: + indices = np.arange(len(self), dtype=np.int32)[key] + return self.take(indices) - keep_inds = np.flatnonzero((indices >= start) & (indices <= end)) - sp_vals = self.sp_values[keep_inds] - sp_index = indices[keep_inds] - new_sp_index = make_sparse_index(len(self), sp_index, self.kind) - return type(self)._simple_new(sp_vals, new_sp_index, self.dtype) else: # TODO: I think we can avoid densifying when masking a # boolean SparseArray with another. Need to look at the diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py index 8c64c5bb3a055..e4d85b138bb69 100644 --- a/pandas/tests/arrays/sparse/test_array.py +++ b/pandas/tests/arrays/sparse/test_array.py @@ -679,23 +679,37 @@ def test_getitem_arraylike_mask(self): expected = SparseArray([0, 2]) tm.assert_sp_array_equal(result, expected) - def test_getslice(self): - result = self.arr[:-3] - exp = SparseArray(self.arr.to_dense()[:-3]) - tm.assert_sp_array_equal(result, exp) - - result = self.arr[-4:] - exp = SparseArray(self.arr.to_dense()[-4:]) - tm.assert_sp_array_equal(result, exp) - - # two corner cases from Series - result = self.arr[-12:] - exp = SparseArray(self.arr) - tm.assert_sp_array_equal(result, exp) - - result = self.arr[:-12] - exp = SparseArray(self.arr.to_dense()[:0]) - tm.assert_sp_array_equal(result, exp) + @pytest.mark.parametrize( + "slc", + [ + np.s_[:], + np.s_[1:10], + np.s_[1:100], + np.s_[10:1], + np.s_[:-3], + np.s_[-5:-4], + np.s_[:-12], + np.s_[-12:], + np.s_[2:], + np.s_[2::3], + np.s_[::2], + np.s_[::-1], + np.s_[::-2], + np.s_[1:6:2], + np.s_[:-6:-2], + ], + ) + @pytest.mark.parametrize( + "as_dense", [[np.nan] * 10, [1] * 10, [np.nan] * 5 + [1] * 5, []] + ) + def test_getslice(self, slc, as_dense): + as_dense = np.array(as_dense) + arr = SparseArray(as_dense) + + result = arr[slc] + expected = SparseArray(as_dense[slc]) + + tm.assert_sp_array_equal(result, expected) def test_getslice_tuple(self): dense = np.array([np.nan, 0, 3, 4, 0, 5, np.nan, np.nan, 0]) From 79cebfcbfa8ef2d5c8cc3247fd34a6dbf2d1ddad Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Mon, 27 Sep 2021 20:39:56 -0700 Subject: [PATCH 3/6] Add benchmarks --- asv_bench/benchmarks/sparse.py | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/asv_bench/benchmarks/sparse.py b/asv_bench/benchmarks/sparse.py index ff1c4c92fe551..80587bcad4c6b 100644 --- a/asv_bench/benchmarks/sparse.py +++ b/asv_bench/benchmarks/sparse.py @@ -9,6 +9,8 @@ ) from pandas.arrays import SparseArray +from .pandas_vb_common import BaseIO + def make_array(size, dense_proportion, fill_value, dtype): dense_size = int(size * dense_proportion) @@ -105,6 +107,22 @@ def time_to_coo(self): self.df.sparse.to_coo() +class ToCSV(BaseIO): + fname = "__test__.csv" + + def setup(self): + N = 500_000 + sp_arr = SparseArray(make_array(N, 1e-5, np.nan, np.float64)) + self.ser = Series(sp_arr) + self.df = pd.DataFrame({"A": self.ser, "B": self.ser.copy()}) + + def time_to_csv_series(self): + self.ser.to_csv(self.fname) + + def time_to_csv_frame(self): + self.df.to_csv(self.fname) + + class Arithmetic: params = ([0.1, 0.01], [0, np.nan]) @@ -195,4 +213,17 @@ def time_take(self, indices, allow_fill): self.sp_arr.take(indices, allow_fill=allow_fill) +class GetItem: + def setup(self): + N = 1_000_000 + arr = make_array(N, 1e-5, np.nan, np.float64) + self.sp_arr = SparseArray(arr) + + def time_integer_indexing(self): + self.sp_arr[78] + + def time_slice(self): + self.sp_arr[1:] + + from .pandas_vb_common import setup # noqa: F401 isort:skip From 2182afa32209c3d1102e18b193a590a10fcc739d Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Mon, 27 Sep 2021 20:42:37 -0700 Subject: [PATCH 4/6] Fix accidental commit --- pandas/core/internals/blocks.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 3fdc6920bdaa9..002473a1a5fb2 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -2078,9 +2078,6 @@ def to_native_types( """convert to our native types format""" values = ensure_wrapped_if_datetimelike(values) - if is_sparse(values): - values = values.to_dense() - if isinstance(values, (DatetimeArray, TimedeltaArray)): result = values._format_native_types(na_rep=na_rep, **kwargs) result = result.astype(object, copy=False) From 5622f636b1e677fd4eb5a0add8a807bfd99312bd Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Mon, 27 Sep 2021 20:47:35 -0700 Subject: [PATCH 5/6] Remove slow benchmark to update later --- asv_bench/benchmarks/sparse.py | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/asv_bench/benchmarks/sparse.py b/asv_bench/benchmarks/sparse.py index 80587bcad4c6b..8f424685dfbae 100644 --- a/asv_bench/benchmarks/sparse.py +++ b/asv_bench/benchmarks/sparse.py @@ -9,8 +9,6 @@ ) from pandas.arrays import SparseArray -from .pandas_vb_common import BaseIO - def make_array(size, dense_proportion, fill_value, dtype): dense_size = int(size * dense_proportion) @@ -107,22 +105,6 @@ def time_to_coo(self): self.df.sparse.to_coo() -class ToCSV(BaseIO): - fname = "__test__.csv" - - def setup(self): - N = 500_000 - sp_arr = SparseArray(make_array(N, 1e-5, np.nan, np.float64)) - self.ser = Series(sp_arr) - self.df = pd.DataFrame({"A": self.ser, "B": self.ser.copy()}) - - def time_to_csv_series(self): - self.ser.to_csv(self.fname) - - def time_to_csv_frame(self): - self.df.to_csv(self.fname) - - class Arithmetic: params = ([0.1, 0.01], [0, np.nan]) From f393d0c46fb2c11be0447d2e7a77068dd9f50715 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Mon, 27 Sep 2021 20:52:59 -0700 Subject: [PATCH 6/6] Add pr number --- doc/source/whatsnew/v1.4.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 9c6bf37888b2d..d65d21878a584 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -357,7 +357,7 @@ Performance improvements - Performance improvement in indexing with a :class:`MultiIndex` indexer on another :class:`MultiIndex` (:issue:43370`) - Performance improvement in :meth:`GroupBy.quantile` (:issue:`43469`) - :meth:`SparseArray.min` and :meth:`SparseArray.max` no longer require converting to a dense array (:issue:`43526`) -- Indexing into a :class:`SparseArray` with a ``slice`` with ``step=1`` no longer requires converting to a dense array (:issue:`?`) +- Indexing into a :class:`SparseArray` with a ``slice`` with ``step=1`` no longer requires converting to a dense array (:issue:`43777`) - Performance improvement in :meth:`SparseArray.take` with ``allow_fill=False`` (:issue:`43654`) - Performance improvement in :meth:`.Rolling.mean` and :meth:`.Expanding.mean` with ``engine="numba"`` (:issue:`43612`) -