From a28e2c62e1e807a2f9ddf1f7129e38e0f0f6efd7 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 15 Aug 2023 14:39:29 -0700 Subject: [PATCH 1/6] REF: fix can_use_libjoin check --- pandas/core/indexes/base.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index e23887159c9c6..3c6241b6e0207 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4611,9 +4611,23 @@ def join( _validate_join_method(how) - if ( - not isinstance(self.dtype, CategoricalDtype) - and self.is_monotonic_increasing + if not self.is_unique and not other.is_unique: + return self._join_non_unique(other, how=how) + elif not self.is_unique or not other.is_unique: + if self.is_monotonic_increasing and other.is_monotonic_increasing: + # Note: 2023-08-15 we *do* have tests that get here with + # Categorical, string[python] (can use libjoin) + # and Interval (cannot) + if self._can_use_libjoin: + # otherwise we will fall through to _join_via_get_indexer + # GH#39133 + # go through object dtype for ea till engine is supported properly + return self._join_monotonic(other, how=how) + else: + return self._join_non_unique(other, how=how) + elif ( + # GH48504: exclude MultiIndex to avoid going through MultiIndex._values + self.is_monotonic_increasing and other.is_monotonic_increasing and self._can_use_libjoin and other._can_use_libjoin From d9cfa91f70f9c3ef8bbcc4d9ce150a4091d6ff57 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 15 Aug 2023 20:34:25 -0700 Subject: [PATCH 2/6] DOC: docstring for can_use_libjoin --- pandas/core/indexes/base.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 3c6241b6e0207..5b4131b4f9e7a 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -5027,7 +5027,8 @@ def _can_use_libjoin(self) -> bool: ) # Exclude index types where the conversion to numpy converts to object dtype, # which negates the performance benefit of libjoin - # TODO: exclude RangeIndex? Seems to break test_concat_datetime_timezone + # TODO: exclude RangeIndex (which allocated memory)? + # Doing so seems to break test_concat_datetime_timezone return not isinstance(self, (ABCIntervalIndex, ABCMultiIndex)) # -------------------------------------------------------------------- @@ -5166,7 +5167,7 @@ def _get_join_target(self) -> np.ndarray: # TODO: exclude ABCRangeIndex case here as it copies target = self._get_engine_target() if not isinstance(target, np.ndarray): - raise ValueError("_can_use_libjoin should return False.") + raise ValueError("_can_use_libjoin should raise in this case.") return target def _from_join_target(self, result: np.ndarray) -> ArrayLike: From 851f7259fb3e6074ca567f6abdbc0ee507f77e7b Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 16 Aug 2023 11:34:35 -0700 Subject: [PATCH 3/6] Make can_use_libjoin checks more-correct --- pandas/core/indexes/base.py | 45 +++++++++++++------------------------ 1 file changed, 15 insertions(+), 30 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 5b4131b4f9e7a..7b0d207eec5c9 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3334,9 +3334,7 @@ def _union(self, other: Index, sort: bool | None): if ( sort in (None, True) - and self.is_monotonic_increasing - and other.is_monotonic_increasing - and not (self.has_duplicates and other.has_duplicates) + and (self.is_unique or other.is_unique) and self._can_use_libjoin and other._can_use_libjoin ): @@ -3488,12 +3486,7 @@ def _intersection(self, other: Index, sort: bool = False): """ intersection specialized to the case with matching dtypes. """ - if ( - self.is_monotonic_increasing - and other.is_monotonic_increasing - and self._can_use_libjoin - and other._can_use_libjoin - ): + if self._can_use_libjoin and other._can_use_libjoin: try: res_indexer, indexer, _ = self._inner_indexer(other) except TypeError: @@ -4611,24 +4604,9 @@ def join( _validate_join_method(how) - if not self.is_unique and not other.is_unique: - return self._join_non_unique(other, how=how) - elif not self.is_unique or not other.is_unique: - if self.is_monotonic_increasing and other.is_monotonic_increasing: - # Note: 2023-08-15 we *do* have tests that get here with - # Categorical, string[python] (can use libjoin) - # and Interval (cannot) - if self._can_use_libjoin: - # otherwise we will fall through to _join_via_get_indexer - # GH#39133 - # go through object dtype for ea till engine is supported properly - return self._join_monotonic(other, how=how) - else: - return self._join_non_unique(other, how=how) - elif ( - # GH48504: exclude MultiIndex to avoid going through MultiIndex._values - self.is_monotonic_increasing - and other.is_monotonic_increasing + if ( + not isinstance(self.dtype, CategoricalDtype) + # test_join_with_categorical_index requires excluding CategoricalDtype and self._can_use_libjoin and other._can_use_libjoin and (self.is_unique or other.is_unique) @@ -4935,7 +4913,10 @@ def _get_leaf_sorter(labels: list[np.ndarray]) -> npt.NDArray[np.intp]: def _join_monotonic( self, other: Index, how: JoinHow = "left" ) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]: - # We only get here with matching dtypes and both monotonic increasing + # We only get here with (caller is responsible for ensuring): + # 1) matching dtypes + # 2) both monotonic increasing + # 3) other.is_unique or self.is_unique assert other.dtype == self.dtype assert self._can_use_libjoin and other._can_use_libjoin @@ -5017,6 +4998,10 @@ def _can_use_libjoin(self) -> bool: making a copy. If we cannot, this negates the performance benefit of using libjoin. """ + if not self.is_monotonic_increasing: + # The libjoin functions all assume monotonicity. + return False + if type(self) is Index: # excludes EAs, but include masks, we get here with monotonic # values only, meaning no NA @@ -5027,7 +5012,7 @@ def _can_use_libjoin(self) -> bool: ) # Exclude index types where the conversion to numpy converts to object dtype, # which negates the performance benefit of libjoin - # TODO: exclude RangeIndex (which allocated memory)? + # TODO: exclude RangeIndex (which allocates memory)? # Doing so seems to break test_concat_datetime_timezone return not isinstance(self, (ABCIntervalIndex, ABCMultiIndex)) @@ -5167,7 +5152,7 @@ def _get_join_target(self) -> np.ndarray: # TODO: exclude ABCRangeIndex case here as it copies target = self._get_engine_target() if not isinstance(target, np.ndarray): - raise ValueError("_can_use_libjoin should raise in this case.") + raise ValueError("_can_use_libjoin should return False.") return target def _from_join_target(self, result: np.ndarray) -> ArrayLike: From a4b702b7a9958ac043f6df677ee6dcd693aa180f Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 16 Oct 2023 18:21:29 -0700 Subject: [PATCH 4/6] avoid allocating mapping in monotonic cases --- pandas/_libs/index.pyx | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 5c7680bc6fb6c..5455c9aa2c866 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -247,6 +247,10 @@ cdef class IndexEngine: @property def is_unique(self) -> bool: + # for why we check is_monotonic_increasing here, see + # https://github.com/pandas-dev/pandas/pull/55342#discussion_r1361405781 + if self.need_monotonic_check: + self.is_monotonic_increasing if self.need_unique_check: self._do_unique_check() @@ -922,6 +926,10 @@ cdef class SharedEngine: @property def is_unique(self) -> bool: + # for why we check is_monotonic_increasing here, see + # https://github.com/pandas-dev/pandas/pull/55342#discussion_r1361405781 + if self.need_monotonic_check: + self.is_monotonic_increasing if self.need_unique_check: arr = self.values.unique() self.unique = len(arr) == len(self.values) From 6bf14b5ce99eeaff750cb96b99418ad55adff946 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Mon, 30 Oct 2023 18:04:42 -0400 Subject: [PATCH 5/6] fix categorical memory usage tests --- pandas/core/frame.py | 2 +- pandas/tests/extension/test_categorical.py | 5 ----- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index f37be37f37693..618b84e70fb13 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3692,7 +3692,7 @@ def memory_usage(self, index: bool = True, deep: bool = False) -> Series: many repeated values. >>> df['object'].astype('category').memory_usage(deep=True) - 5244 + 5136 """ result = self._constructor_sliced( [c.memory_usage(index=False, deep=deep) for col, c in self.items()], diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py index 5cde5df4bc007..650da67523b45 100644 --- a/pandas/tests/extension/test_categorical.py +++ b/pandas/tests/extension/test_categorical.py @@ -73,11 +73,6 @@ def data_for_grouping(): class TestCategorical(base.ExtensionTests): - @pytest.mark.xfail(reason="Memory usage doesn't match") - def test_memory_usage(self, data): - # TODO: Is this deliberate? - super().test_memory_usage(data) - def test_contains(self, data, data_missing): # GH-37867 # na value handling in Categorical.__contains__ is deprecated. From 81e74a9e85f292575920762d9d448520ebacd79d Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Wed, 1 Nov 2023 07:24:11 -0400 Subject: [PATCH 6/6] catch decimal.InvalidOperation --- pandas/_libs/index.pyx | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 5455c9aa2c866..1202898274227 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -42,6 +42,8 @@ from pandas._libs.missing cimport ( is_matching_na, ) +from decimal import InvalidOperation + # Defines shift of MultiIndex codes to avoid negative codes (missing values) multiindex_nulls_shift = 2 @@ -284,7 +286,7 @@ cdef class IndexEngine: values = self.values self.monotonic_inc, self.monotonic_dec, is_strict_monotonic = \ self._call_monotonic(values) - except TypeError: + except (TypeError, InvalidOperation): self.monotonic_inc = 0 self.monotonic_dec = 0 is_strict_monotonic = 0