From 23c85b14c2d8a36d30e82b41424f04f5e71e7877 Mon Sep 17 00:00:00 2001 From: Jeffrey Harrison Date: Fri, 7 Mar 2025 17:04:52 -0700 Subject: [PATCH 1/7] Save original index and remap after function completes. --- pandas/core/methods/selectn.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/pandas/core/methods/selectn.py b/pandas/core/methods/selectn.py index 02e7445f1d275..3e10f8bad31f3 100644 --- a/pandas/core/methods/selectn.py +++ b/pandas/core/methods/selectn.py @@ -111,13 +111,20 @@ def compute(self, method: str) -> Series: if n <= 0: return self.obj[[]] - dropped = self.obj.dropna() - nan_index = self.obj.drop(dropped.index) + # Save index and reset to default index to avoid performance impact + # from when index contains duplicates + original_index = self.obj.index + cur_series = self.obj.reset_index(drop=True) + + dropped = cur_series.dropna() + nan_index = cur_series.drop(dropped.index) # slow method - if n >= len(self.obj): + if n >= len(cur_series): ascending = method == "nsmallest" - return self.obj.sort_values(ascending=ascending).head(n) + final_series = cur_series.sort_values(ascending=ascending, kind="mergesort").head(n) + final_series.index = original_index.take(final_series.index) + return final_series # fast method new_dtype = dropped.dtype @@ -173,7 +180,9 @@ def compute(self, method: str) -> Series: # reverse indices inds = narr - 1 - inds - return concat([dropped.iloc[inds], nan_index]).iloc[:findex] + final_series = concat([dropped.iloc[inds], nan_index]).iloc[:findex] + final_series.index = original_index.take(final_series.index) + return final_series class SelectNFrame(SelectN[DataFrame]): From 4c39ea37812e8931b01454055095102db6553fb3 Mon Sep 17 00:00:00 2001 From: Jeffrey Harrison Date: Thu, 13 Mar 2025 14:33:04 -0600 Subject: [PATCH 2/7] precommit passes --- pandas/core/methods/selectn.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/pandas/core/methods/selectn.py b/pandas/core/methods/selectn.py index 3e10f8bad31f3..e7fbb29d4fc49 100644 --- a/pandas/core/methods/selectn.py +++ b/pandas/core/methods/selectn.py @@ -11,6 +11,7 @@ from typing import ( TYPE_CHECKING, Generic, + Literal, cast, final, ) @@ -54,7 +55,9 @@ class SelectN(Generic[NDFrameT]): - def __init__(self, obj: NDFrameT, n: int, keep: str) -> None: + def __init__( + self, obj: NDFrameT, n: int, keep: Literal["first", "last", "all"] + ) -> None: self.obj = obj self.n = n self.keep = keep @@ -111,9 +114,9 @@ def compute(self, method: str) -> Series: if n <= 0: return self.obj[[]] - # Save index and reset to default index to avoid performance impact + # Save index and reset to default index to avoid performance impact # from when index contains duplicates - original_index = self.obj.index + original_index: Index = self.obj.index cur_series = self.obj.reset_index(drop=True) dropped = cur_series.dropna() @@ -122,7 +125,9 @@ def compute(self, method: str) -> Series: # slow method if n >= len(cur_series): ascending = method == "nsmallest" - final_series = cur_series.sort_values(ascending=ascending, kind="mergesort").head(n) + final_series = cur_series.sort_values( + ascending=ascending, kind="mergesort" + ).head(n) final_series.index = original_index.take(final_series.index) return final_series From 0c0c3b5bd4f305db6b37479604dad31c81de180b Mon Sep 17 00:00:00 2001 From: Jeffrey Harrison Date: Thu, 13 Mar 2025 15:29:23 -0600 Subject: [PATCH 3/7] use stable sorting 'mergesort' in tests --- pandas/tests/frame/methods/test_nlargest.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/frame/methods/test_nlargest.py b/pandas/tests/frame/methods/test_nlargest.py index c6e5304ae3cb4..647ef87671198 100644 --- a/pandas/tests/frame/methods/test_nlargest.py +++ b/pandas/tests/frame/methods/test_nlargest.py @@ -153,11 +153,11 @@ def test_nlargest_n_duplicate_index(self, n, order, request): index=[0, 0, 1, 1, 1], ) result = df.nsmallest(n, order) - expected = df.sort_values(order).head(n) + expected = df.sort_values(order, kind="mergesort").head(n) tm.assert_frame_equal(result, expected) result = df.nlargest(n, order) - expected = df.sort_values(order, ascending=False).head(n) + expected = df.sort_values(order, ascending=False, kind="mergesort").head(n) if Version(np.__version__) >= Version("1.25") and ( (order == ["a"] and n in (1, 2, 3, 4)) or ((order == ["a", "b"]) and n == 5) ): From cb3586fdb3a4a2eac324e2a4ec8f6c3b2c3bdc82 Mon Sep 17 00:00:00 2001 From: Jeffrey Harrison Date: Thu, 13 Mar 2025 16:03:29 -0600 Subject: [PATCH 4/7] Change sorts to `stable` instead of mergesort --- doc/source/whatsnew/v3.0.0.rst | 3 +++ pandas/core/methods/selectn.py | 10 +++++----- pandas/tests/frame/methods/test_nlargest.py | 4 ++-- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 3294af742fae0..05b52f7ec950b 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -72,6 +72,7 @@ Other enhancements - :meth:`DataFrameGroupBy.transform`, :meth:`SeriesGroupBy.transform`, :meth:`DataFrameGroupBy.agg`, :meth:`SeriesGroupBy.agg`, :meth:`RollingGroupby.apply`, :meth:`ExpandingGroupby.apply`, :meth:`Rolling.apply`, :meth:`Expanding.apply`, :meth:`DataFrame.apply` with ``engine="numba"`` now supports positional arguments passed as kwargs (:issue:`58995`) - :meth:`Rolling.agg`, :meth:`Expanding.agg` and :meth:`ExponentialMovingWindow.agg` now accept :class:`NamedAgg` aggregations through ``**kwargs`` (:issue:`28333`) - :meth:`Series.map` can now accept kwargs to pass on to func (:issue:`59814`) +- :meth:`Series.nlargest` has improved performance when there are duplicate values in the index (:issue:`55767`) - :meth:`Series.str.get_dummies` now accepts a ``dtype`` parameter to specify the dtype of the resulting DataFrame (:issue:`47872`) - :meth:`pandas.concat` will raise a ``ValueError`` when ``ignore_index=True`` and ``keys`` is not ``None`` (:issue:`59274`) - :py:class:`frozenset` elements in pandas objects are now natively printed (:issue:`60690`) @@ -152,6 +153,8 @@ These improvements also fixed certain bugs in groupby: - :meth:`.DataFrameGroupBy.sum` would have incorrect values when there are multiple groupings, unobserved groups, and non-numeric data (:issue:`43891`) - :meth:`.DataFrameGroupBy.value_counts` would produce incorrect results when used with some categorical and some non-categorical groupings and ``observed=False`` (:issue:`56016`) +- :meth:`Series.nlargest` + .. _whatsnew_300.notable_bug_fixes.notable_bug_fix2: notable_bug_fix2 diff --git a/pandas/core/methods/selectn.py b/pandas/core/methods/selectn.py index e7fbb29d4fc49..544c5ce4df8ab 100644 --- a/pandas/core/methods/selectn.py +++ b/pandas/core/methods/selectn.py @@ -119,18 +119,18 @@ def compute(self, method: str) -> Series: original_index: Index = self.obj.index cur_series = self.obj.reset_index(drop=True) - dropped = cur_series.dropna() - nan_index = cur_series.drop(dropped.index) - # slow method if n >= len(cur_series): ascending = method == "nsmallest" final_series = cur_series.sort_values( - ascending=ascending, kind="mergesort" + ascending=ascending, kind="stable" ).head(n) final_series.index = original_index.take(final_series.index) return final_series + dropped = cur_series.dropna() + nan_index = cur_series.drop(dropped.index) + # fast method new_dtype = dropped.dtype @@ -291,4 +291,4 @@ def get_indexer(current_indexer: Index, other_indexer: Index) -> Index: ascending = method == "nsmallest" - return frame.sort_values(columns, ascending=ascending, kind="mergesort") + return frame.sort_values(columns, ascending=ascending, kind="stable") diff --git a/pandas/tests/frame/methods/test_nlargest.py b/pandas/tests/frame/methods/test_nlargest.py index 647ef87671198..08b7128e6ec11 100644 --- a/pandas/tests/frame/methods/test_nlargest.py +++ b/pandas/tests/frame/methods/test_nlargest.py @@ -153,11 +153,11 @@ def test_nlargest_n_duplicate_index(self, n, order, request): index=[0, 0, 1, 1, 1], ) result = df.nsmallest(n, order) - expected = df.sort_values(order, kind="mergesort").head(n) + expected = df.sort_values(order, kind="stable").head(n) tm.assert_frame_equal(result, expected) result = df.nlargest(n, order) - expected = df.sort_values(order, ascending=False, kind="mergesort").head(n) + expected = df.sort_values(order, ascending=False, kind="stable").head(n) if Version(np.__version__) >= Version("1.25") and ( (order == ["a"] and n in (1, 2, 3, 4)) or ((order == ["a", "b"]) and n == 5) ): From ba2470e3e764cb18ad0f9d6543eead9f933458cd Mon Sep 17 00:00:00 2001 From: Jeffrey Harrison Date: Fri, 14 Mar 2025 09:40:13 -0600 Subject: [PATCH 5/7] modify 'keep' to use a Literal instead of string --- pandas/core/methods/selectn.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pandas/core/methods/selectn.py b/pandas/core/methods/selectn.py index 544c5ce4df8ab..b3bff2c5186b5 100644 --- a/pandas/core/methods/selectn.py +++ b/pandas/core/methods/selectn.py @@ -206,7 +206,13 @@ class SelectNFrame(SelectN[DataFrame]): nordered : DataFrame """ - def __init__(self, obj: DataFrame, n: int, keep: str, columns: IndexLabel) -> None: + def __init__( + self, + obj: DataFrame, + n: int, + keep: Literal["first", "last", "all"], + columns: IndexLabel, + ) -> None: super().__init__(obj, n, keep) if not is_list_like(columns) or isinstance(columns, tuple): columns = [columns] From 1fb6cc3b2c48ba1c5392d68848566a36406c0f51 Mon Sep 17 00:00:00 2001 From: Jeffrey Harrison Date: Wed, 26 Mar 2025 12:32:57 -0600 Subject: [PATCH 6/7] address comments --- pandas/core/methods/selectn.py | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/pandas/core/methods/selectn.py b/pandas/core/methods/selectn.py index b3bff2c5186b5..59516b16905dc 100644 --- a/pandas/core/methods/selectn.py +++ b/pandas/core/methods/selectn.py @@ -117,21 +117,22 @@ def compute(self, method: str) -> Series: # Save index and reset to default index to avoid performance impact # from when index contains duplicates original_index: Index = self.obj.index - cur_series = self.obj.reset_index(drop=True) + default_index = self.obj.reset_index(drop=True) - # slow method - if n >= len(cur_series): + # Slower method used when taking the full length of the series + # In this case, it is equivalent to a sort. + if n >= len(default_index): ascending = method == "nsmallest" - final_series = cur_series.sort_values( - ascending=ascending, kind="stable" - ).head(n) - final_series.index = original_index.take(final_series.index) - return final_series + result = default_index.sort_values(ascending=ascending, kind="stable").head( + n + ) + result.index = original_index.take(result.index) + return result - dropped = cur_series.dropna() - nan_index = cur_series.drop(dropped.index) + # Fast method used in the general case + dropped = default_index.dropna() + nan_index = default_index.drop(dropped.index) - # fast method new_dtype = dropped.dtype # Similar to algorithms._ensure_data @@ -170,7 +171,7 @@ def compute(self, method: str) -> Series: else: kth_val = np.nan (ns,) = np.nonzero(arr <= kth_val) - inds = ns[arr[ns].argsort(kind="mergesort")] + inds = ns[arr[ns].argsort(kind="stable")] if self.keep != "all": inds = inds[:n] @@ -185,9 +186,9 @@ def compute(self, method: str) -> Series: # reverse indices inds = narr - 1 - inds - final_series = concat([dropped.iloc[inds], nan_index]).iloc[:findex] - final_series.index = original_index.take(final_series.index) - return final_series + result = concat([dropped.iloc[inds], nan_index]).iloc[:findex] + result.index = original_index.take(result.index) + return result class SelectNFrame(SelectN[DataFrame]): From 8aa2a2c12346aa967c9731129404e29b613457a9 Mon Sep 17 00:00:00 2001 From: Jeffrey Harrison Date: Wed, 26 Mar 2025 12:50:30 -0600 Subject: [PATCH 7/7] update doc to include stable sort change --- doc/source/whatsnew/v3.0.0.rst | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 05b52f7ec950b..2d74be6f503a2 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -61,6 +61,7 @@ Other enhancements - :meth:`Series.cummin` and :meth:`Series.cummax` now supports :class:`CategoricalDtype` (:issue:`52335`) - :meth:`Series.plot` now correctly handle the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`) - :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`) +- :meth:`Series.nlargest` uses a 'stable' sort internally and will preserve original ordering. - :class:`ArrowDtype` now supports ``pyarrow.JsonType`` (:issue:`60958`) - :class:`DataFrameGroupBy` and :class:`SeriesGroupBy` methods ``sum``, ``mean``, ``median``, ``prod``, ``min``, ``max``, ``std``, ``var`` and ``sem`` now accept ``skipna`` parameter (:issue:`15675`) - :class:`Rolling` and :class:`Expanding` now support ``nunique`` (:issue:`26958`) @@ -72,7 +73,6 @@ Other enhancements - :meth:`DataFrameGroupBy.transform`, :meth:`SeriesGroupBy.transform`, :meth:`DataFrameGroupBy.agg`, :meth:`SeriesGroupBy.agg`, :meth:`RollingGroupby.apply`, :meth:`ExpandingGroupby.apply`, :meth:`Rolling.apply`, :meth:`Expanding.apply`, :meth:`DataFrame.apply` with ``engine="numba"`` now supports positional arguments passed as kwargs (:issue:`58995`) - :meth:`Rolling.agg`, :meth:`Expanding.agg` and :meth:`ExponentialMovingWindow.agg` now accept :class:`NamedAgg` aggregations through ``**kwargs`` (:issue:`28333`) - :meth:`Series.map` can now accept kwargs to pass on to func (:issue:`59814`) -- :meth:`Series.nlargest` has improved performance when there are duplicate values in the index (:issue:`55767`) - :meth:`Series.str.get_dummies` now accepts a ``dtype`` parameter to specify the dtype of the resulting DataFrame (:issue:`47872`) - :meth:`pandas.concat` will raise a ``ValueError`` when ``ignore_index=True`` and ``keys`` is not ``None`` (:issue:`59274`) - :py:class:`frozenset` elements in pandas objects are now natively printed (:issue:`60690`) @@ -153,8 +153,6 @@ These improvements also fixed certain bugs in groupby: - :meth:`.DataFrameGroupBy.sum` would have incorrect values when there are multiple groupings, unobserved groups, and non-numeric data (:issue:`43891`) - :meth:`.DataFrameGroupBy.value_counts` would produce incorrect results when used with some categorical and some non-categorical groupings and ``observed=False`` (:issue:`56016`) -- :meth:`Series.nlargest` - .. _whatsnew_300.notable_bug_fixes.notable_bug_fix2: notable_bug_fix2 @@ -596,6 +594,7 @@ Performance improvements - :func:`concat` returns a :class:`RangeIndex` column when possible when ``objs`` contains :class:`Series` and :class:`DataFrame` and ``axis=0`` (:issue:`58119`) - :func:`concat` returns a :class:`RangeIndex` level in the :class:`MultiIndex` result when ``keys`` is a ``range`` or :class:`RangeIndex` (:issue:`57542`) - :meth:`RangeIndex.append` returns a :class:`RangeIndex` instead of a :class:`Index` when appending values that could continue the :class:`RangeIndex` (:issue:`57467`) +- :meth:`Series.nlargest` has improved performance when there are duplicate values in the index (:issue:`55767`) - :meth:`Series.str.extract` returns a :class:`RangeIndex` columns instead of an :class:`Index` column when possible (:issue:`57542`) - :meth:`Series.str.partition` with :class:`ArrowDtype` returns a :class:`RangeIndex` columns instead of an :class:`Index` column when possible (:issue:`57768`) - Performance improvement in :class:`DataFrame` when ``data`` is a ``dict`` and ``columns`` is specified (:issue:`24368`)