diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 249f08c7e387b..1b8864809975f 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -178,6 +178,7 @@ Performance improvements - Performance improvement in :func:`concat` with ``axis=1`` and objects with unaligned indexes (:issue:`55084`) - Performance improvement in :func:`to_dict` on converting DataFrame to dictionary (:issue:`50990`) - Performance improvement in :meth:`DataFrame.sort_index` and :meth:`Series.sort_index` when indexed by a :class:`MultiIndex` (:issue:`54835`) +- Performance improvement in :meth:`Index.difference` (:issue:`55108`) - Performance improvement when indexing with more than 4 keys (:issue:`54550`) - diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index cd55997ad5f69..8756bb3f3c81b 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3615,21 +3615,10 @@ def difference(self, other, sort=None): def _difference(self, other, sort): # overridden by RangeIndex - - this = self.unique() - - indexer = this.get_indexer_for(other) - indexer = indexer.take((indexer != -1).nonzero()[0]) - - label_diff = np.setdiff1d(np.arange(this.size), indexer, assume_unique=True) - - the_diff: MultiIndex | ArrayLike - if isinstance(this, ABCMultiIndex): - the_diff = this.take(label_diff) - else: - the_diff = this._values.take(label_diff) + other = other.unique() + the_diff = self[other.get_indexer_for(self) == -1] + the_diff = the_diff if self.is_unique else the_diff.unique() the_diff = _maybe_try_sort(the_diff, sort) - return the_diff def _wrap_difference_result(self, other, result): diff --git a/pandas/tests/indexes/datetimes/test_setops.py b/pandas/tests/indexes/datetimes/test_setops.py index 2e7b38abf4212..b56bad7f2e833 100644 --- a/pandas/tests/indexes/datetimes/test_setops.py +++ b/pandas/tests/indexes/datetimes/test_setops.py @@ -343,9 +343,11 @@ def test_difference_freq(self, sort): tm.assert_index_equal(idx_diff, expected) tm.assert_attr_equal("freq", idx_diff, expected) + # preserve frequency when the difference is a contiguous + # subset of the original range other = date_range("20160922", "20160925", freq="D") idx_diff = index.difference(other, sort) - expected = DatetimeIndex(["20160920", "20160921"], freq=None) + expected = DatetimeIndex(["20160920", "20160921"], freq="D") tm.assert_index_equal(idx_diff, expected) tm.assert_attr_equal("freq", idx_diff, expected) diff --git a/pandas/tests/indexes/timedeltas/test_setops.py b/pandas/tests/indexes/timedeltas/test_setops.py index cb6dce1e7ad80..6cdd6944e90ea 100644 --- a/pandas/tests/indexes/timedeltas/test_setops.py +++ b/pandas/tests/indexes/timedeltas/test_setops.py @@ -219,9 +219,11 @@ def test_difference_freq(self, sort): tm.assert_index_equal(idx_diff, expected) tm.assert_attr_equal("freq", idx_diff, expected) + # preserve frequency when the difference is a contiguous + # subset of the original range other = timedelta_range("2 days", "5 days", freq="D") idx_diff = index.difference(other, sort) - expected = TimedeltaIndex(["0 days", "1 days"], freq=None) + expected = TimedeltaIndex(["0 days", "1 days"], freq="D") tm.assert_index_equal(idx_diff, expected) tm.assert_attr_equal("freq", idx_diff, expected)