Skip to content

Commit 109ee11

Browse files
authored
BUG: stabilize sort_values algorithms for Series and time-like Indices (#37310)
1 parent 5a66348 commit 109ee11

File tree

19 files changed

+69
-107
lines changed

19 files changed

+69
-107
lines changed

doc/source/whatsnew/v1.2.0.rst

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -309,6 +309,13 @@ Optional libraries below the lowest tested version may still work, but are not c
309309

310310
See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for more.
311311

312+
.. _whatsnew_120.api.other:
313+
314+
Other API changes
315+
^^^^^^^^^^^^^^^^^
316+
317+
- Sorting in descending order is now stable for :meth:`Series.sort_values` and :meth:`Index.sort_values` for DateTime-like :class:`Index` subclasses. This will affect sort order when sorting a :class:`DataFrame` on multiple columns, sorting with a key function that produces duplicates, or requesting the sorting index when using :meth:`Index.sort_values`. When using :meth:`Series.value_counts`, the count of missing values is no longer necessarily last in the list of duplicate counts; its position now corresponds to the position in the original :class:`Series`. When using :meth:`Index.sort_values` for DateTime-like :class:`Index` subclasses, NaTs ignored the ``na_position`` argument and were sorted to the beginning. Now they respect ``na_position``, the default being ``last``, same as other :class:`Index` subclasses. (:issue:`35922`)
318+
312319
.. ---------------------------------------------------------------------------
313320
314321
.. _whatsnew_120.deprecations:

pandas/core/algorithms.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1181,10 +1181,8 @@ def compute(self, method: str) -> Series:
11811181

11821182
# slow method
11831183
if n >= len(self.obj):
1184-
reverse_it = self.keep == "last" or method == "nlargest"
11851184
ascending = method == "nsmallest"
1186-
slc = np.s_[::-1] if reverse_it else np.s_[:]
1187-
return dropped[slc].sort_values(ascending=ascending).head(n)
1185+
return dropped.sort_values(ascending=ascending).head(n)
11881186

11891187
# fast method
11901188
arr, pandas_dtype = _ensure_data(dropped.values)

pandas/core/base.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -933,9 +933,9 @@ def value_counts(
933933
>>> index = pd.Index([3, 1, 2, 3, 4, np.nan])
934934
>>> index.value_counts()
935935
3.0 2
936-
4.0 1
937-
2.0 1
938936
1.0 1
937+
2.0 1
938+
4.0 1
939939
dtype: int64
940940
941941
With `normalize` set to `True`, returns the relative frequency by
@@ -944,9 +944,9 @@ def value_counts(
944944
>>> s = pd.Series([3, 1, 2, 3, 4, np.nan])
945945
>>> s.value_counts(normalize=True)
946946
3.0 0.4
947-
4.0 0.2
948-
2.0 0.2
949947
1.0 0.2
948+
2.0 0.2
949+
4.0 0.2
950950
dtype: float64
951951
952952
**bins**
@@ -957,8 +957,8 @@ def value_counts(
957957
number of half-open bins.
958958
959959
>>> s.value_counts(bins=3)
960-
(2.0, 3.0] 2
961960
(0.996, 2.0] 2
961+
(2.0, 3.0] 2
962962
(3.0, 4.0] 1
963963
dtype: int64
964964
@@ -968,10 +968,10 @@ def value_counts(
968968
969969
>>> s.value_counts(dropna=False)
970970
3.0 2
971-
NaN 1
972-
4.0 1
973-
2.0 1
974971
1.0 1
972+
2.0 1
973+
4.0 1
974+
NaN 1
975975
dtype: int64
976976
"""
977977
result = value_counts(

pandas/core/frame.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5563,8 +5563,8 @@ def value_counts(
55635563
>>> df.value_counts()
55645564
num_legs num_wings
55655565
4 0 2
5566-
6 0 1
55675566
2 2 1
5567+
6 0 1
55685568
dtype: int64
55695569
55705570
>>> df.value_counts(sort=False)
@@ -5584,8 +5584,8 @@ def value_counts(
55845584
>>> df.value_counts(normalize=True)
55855585
num_legs num_wings
55865586
4 0 0.50
5587-
6 0 0.25
55885587
2 2 0.25
5588+
6 0 0.25
55895589
dtype: float64
55905590
"""
55915591
if subset is None:

pandas/core/generic.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10106,7 +10106,7 @@ def describe(
1010610106
categorical
1010710107
count 3
1010810108
unique 3
10109-
top f
10109+
top d
1011010110
freq 1
1011110111
1011210112
Excluding numeric columns from a ``DataFrame`` description.

pandas/core/indexes/base.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4542,9 +4542,7 @@ def sort_values(
45424542

45434543
# GH 35584. Sort missing values according to na_position kwarg
45444544
# ignore na_position for MultiIndex
4545-
if not isinstance(
4546-
self, (ABCMultiIndex, ABCDatetimeIndex, ABCTimedeltaIndex, ABCPeriodIndex)
4547-
):
4545+
if not isinstance(self, ABCMultiIndex):
45484546
_as = nargsort(
45494547
items=idx, ascending=ascending, na_position=na_position, key=key
45504548
)

pandas/core/series.py

Lines changed: 9 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,7 @@
9292
from pandas.core.indexing import check_bool_indexer
9393
from pandas.core.internals import SingleBlockManager
9494
from pandas.core.shared_docs import _shared_docs
95-
from pandas.core.sorting import ensure_key_mapped
95+
from pandas.core.sorting import ensure_key_mapped, nargsort
9696
from pandas.core.strings import StringMethods
9797
from pandas.core.tools.datetimes import to_datetime
9898

@@ -3288,29 +3288,6 @@ def sort_values(
32883288
"sort in-place you must create a copy"
32893289
)
32903290

3291-
def _try_kind_sort(arr):
3292-
arr = ensure_key_mapped(arr, key)
3293-
arr = getattr(arr, "_values", arr)
3294-
3295-
# easier to ask forgiveness than permission
3296-
try:
3297-
# if kind==mergesort, it can fail for object dtype
3298-
return arr.argsort(kind=kind)
3299-
except TypeError:
3300-
# stable sort not available for object dtype
3301-
# uses the argsort default quicksort
3302-
return arr.argsort(kind="quicksort")
3303-
3304-
arr = self._values
3305-
sorted_index = np.empty(len(self), dtype=np.int32)
3306-
3307-
bad = isna(arr)
3308-
3309-
good = ~bad
3310-
idx = ibase.default_index(len(self))
3311-
3312-
argsorted = _try_kind_sort(self[good])
3313-
33143291
if is_list_like(ascending):
33153292
if len(ascending) != 1:
33163293
raise ValueError(
@@ -3321,21 +3298,16 @@ def _try_kind_sort(arr):
33213298
if not is_bool(ascending):
33223299
raise ValueError("ascending must be boolean")
33233300

3324-
if not ascending:
3325-
argsorted = argsorted[::-1]
3326-
3327-
if na_position == "last":
3328-
n = good.sum()
3329-
sorted_index[:n] = idx[good][argsorted]
3330-
sorted_index[n:] = idx[bad]
3331-
elif na_position == "first":
3332-
n = bad.sum()
3333-
sorted_index[n:] = idx[good][argsorted]
3334-
sorted_index[:n] = idx[bad]
3335-
else:
3301+
if na_position not in ["first", "last"]:
33363302
raise ValueError(f"invalid na_position: {na_position}")
33373303

3338-
result = self._constructor(arr[sorted_index], index=self.index[sorted_index])
3304+
# GH 35922. Make sorting stable by leveraging nargsort
3305+
values_to_sort = ensure_key_mapped(self, key)._values if key else self._values
3306+
sorted_index = nargsort(values_to_sort, kind, ascending, na_position)
3307+
3308+
result = self._constructor(
3309+
self._values[sorted_index], index=self.index[sorted_index]
3310+
)
33393311

33403312
if ignore_index:
33413313
result.index = ibase.default_index(len(sorted_index))

pandas/tests/arrays/boolean/test_function.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -77,11 +77,11 @@ def test_ufunc_reduce_raises(values):
7777
def test_value_counts_na():
7878
arr = pd.array([True, False, pd.NA], dtype="boolean")
7979
result = arr.value_counts(dropna=False)
80-
expected = pd.Series([1, 1, 1], index=[True, False, pd.NA], dtype="Int64")
80+
expected = pd.Series([1, 1, 1], index=[False, True, pd.NA], dtype="Int64")
8181
tm.assert_series_equal(result, expected)
8282

8383
result = arr.value_counts(dropna=True)
84-
expected = pd.Series([1, 1], index=[True, False], dtype="Int64")
84+
expected = pd.Series([1, 1], index=[False, True], dtype="Int64")
8585
tm.assert_series_equal(result, expected)
8686

8787

pandas/tests/arrays/string_/test_string.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -301,7 +301,7 @@ def test_arrow_roundtrip():
301301
def test_value_counts_na():
302302
arr = pd.array(["a", "b", "a", pd.NA], dtype="string")
303303
result = arr.value_counts(dropna=False)
304-
expected = pd.Series([2, 1, 1], index=["a", "b", pd.NA], dtype="Int64")
304+
expected = pd.Series([2, 1, 1], index=["a", pd.NA, "b"], dtype="Int64")
305305
tm.assert_series_equal(result, expected)
306306

307307
result = arr.value_counts(dropna=True)

pandas/tests/base/test_value_counts.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -153,16 +153,16 @@ def test_value_counts_bins(index_or_series):
153153
# these return the same
154154
res4 = s1.value_counts(bins=4, dropna=True)
155155
intervals = IntervalIndex.from_breaks([0.997, 1.5, 2.0, 2.5, 3.0])
156-
exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 3, 1, 2]))
156+
exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 1, 3, 2]))
157157
tm.assert_series_equal(res4, exp4)
158158

159159
res4 = s1.value_counts(bins=4, dropna=False)
160160
intervals = IntervalIndex.from_breaks([0.997, 1.5, 2.0, 2.5, 3.0])
161-
exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 3, 1, 2]))
161+
exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 1, 3, 2]))
162162
tm.assert_series_equal(res4, exp4)
163163

164164
res4n = s1.value_counts(bins=4, normalize=True)
165-
exp4n = Series([0.5, 0.25, 0.25, 0], index=intervals.take([0, 3, 1, 2]))
165+
exp4n = Series([0.5, 0.25, 0.25, 0], index=intervals.take([0, 1, 3, 2]))
166166
tm.assert_series_equal(res4n, exp4n)
167167

168168
# handle NA's properly
@@ -239,7 +239,11 @@ def test_value_counts_datetime64(index_or_series):
239239
tm.assert_series_equal(result, expected_s)
240240

241241
result = s.value_counts(dropna=False)
242-
expected_s[pd.NaT] = 1
242+
# GH 35922. NaN-like now sorts to the beginning of duplicate counts
243+
idx = pd.to_datetime(
244+
["2010-01-01 00:00:00", "2008-09-09 00:00:00", pd.NaT, "2009-01-01 00:00:00"]
245+
)
246+
expected_s = Series([3, 2, 1, 1], index=idx)
243247
tm.assert_series_equal(result, expected_s)
244248

245249
unique = s.unique()

0 commit comments

Comments
 (0)