From a28e2c62e1e807a2f9ddf1f7129e38e0f0f6efd7 Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Tue, 15 Aug 2023 14:39:29 -0700
Subject: [PATCH 1/6] REF: fix can_use_libjoin check

---
 pandas/core/indexes/base.py | 20 +++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index e23887159c9c6..3c6241b6e0207 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -4611,9 +4611,23 @@ def join(
 
         _validate_join_method(how)
 
-        if (
-            not isinstance(self.dtype, CategoricalDtype)
-            and self.is_monotonic_increasing
+        if not self.is_unique and not other.is_unique:
+            return self._join_non_unique(other, how=how)
+        elif not self.is_unique or not other.is_unique:
+            if self.is_monotonic_increasing and other.is_monotonic_increasing:
+                # Note: 2023-08-15 we *do* have tests that get here with
+                #  Categorical, string[python] (can use libjoin)
+                #  and Interval (cannot)
+                if self._can_use_libjoin:
+                    # otherwise we will fall through to _join_via_get_indexer
+                    # GH#39133
+                    # go through object dtype for ea till engine is supported properly
+                    return self._join_monotonic(other, how=how)
+            else:
+                return self._join_non_unique(other, how=how)
+        elif (
+            # GH48504: exclude MultiIndex to avoid going through MultiIndex._values
+            self.is_monotonic_increasing
             and other.is_monotonic_increasing
             and self._can_use_libjoin
             and other._can_use_libjoin

From d9cfa91f70f9c3ef8bbcc4d9ce150a4091d6ff57 Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Tue, 15 Aug 2023 20:34:25 -0700
Subject: [PATCH 2/6] DOC: docstring for can_use_libjoin

---
 pandas/core/indexes/base.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index 3c6241b6e0207..5b4131b4f9e7a 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -5027,7 +5027,8 @@ def _can_use_libjoin(self) -> bool:
             )
         # Exclude index types where the conversion to numpy converts to object dtype,
         #  which negates the performance benefit of libjoin
-        # TODO: exclude RangeIndex? Seems to break test_concat_datetime_timezone
+        # TODO: exclude RangeIndex (which allocated memory)?
+        #  Doing so seems to break test_concat_datetime_timezone
         return not isinstance(self, (ABCIntervalIndex, ABCMultiIndex))
 
     # --------------------------------------------------------------------
@@ -5166,7 +5167,7 @@ def _get_join_target(self) -> np.ndarray:
         # TODO: exclude ABCRangeIndex case here as it copies
         target = self._get_engine_target()
         if not isinstance(target, np.ndarray):
-            raise ValueError("_can_use_libjoin should return False.")
+            raise ValueError("_can_use_libjoin should raise in this case.")
         return target
 
     def _from_join_target(self, result: np.ndarray) -> ArrayLike:

From 851f7259fb3e6074ca567f6abdbc0ee507f77e7b Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Wed, 16 Aug 2023 11:34:35 -0700
Subject: [PATCH 3/6] Make can_use_libjoin checks more-correct

---
 pandas/core/indexes/base.py | 45 +++++++++++++------------------------
 1 file changed, 15 insertions(+), 30 deletions(-)

diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index 5b4131b4f9e7a..7b0d207eec5c9 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -3334,9 +3334,7 @@ def _union(self, other: Index, sort: bool | None):
 
         if (
             sort in (None, True)
-            and self.is_monotonic_increasing
-            and other.is_monotonic_increasing
-            and not (self.has_duplicates and other.has_duplicates)
+            and (self.is_unique or other.is_unique)
             and self._can_use_libjoin
             and other._can_use_libjoin
         ):
@@ -3488,12 +3486,7 @@ def _intersection(self, other: Index, sort: bool = False):
         """
         intersection specialized to the case with matching dtypes.
         """
-        if (
-            self.is_monotonic_increasing
-            and other.is_monotonic_increasing
-            and self._can_use_libjoin
-            and other._can_use_libjoin
-        ):
+        if self._can_use_libjoin and other._can_use_libjoin:
             try:
                 res_indexer, indexer, _ = self._inner_indexer(other)
             except TypeError:
@@ -4611,24 +4604,9 @@ def join(
 
         _validate_join_method(how)
 
-        if not self.is_unique and not other.is_unique:
-            return self._join_non_unique(other, how=how)
-        elif not self.is_unique or not other.is_unique:
-            if self.is_monotonic_increasing and other.is_monotonic_increasing:
-                # Note: 2023-08-15 we *do* have tests that get here with
-                #  Categorical, string[python] (can use libjoin)
-                #  and Interval (cannot)
-                if self._can_use_libjoin:
-                    # otherwise we will fall through to _join_via_get_indexer
-                    # GH#39133
-                    # go through object dtype for ea till engine is supported properly
-                    return self._join_monotonic(other, how=how)
-            else:
-                return self._join_non_unique(other, how=how)
-        elif (
-            # GH48504: exclude MultiIndex to avoid going through MultiIndex._values
-            self.is_monotonic_increasing
-            and other.is_monotonic_increasing
+        if (
+            not isinstance(self.dtype, CategoricalDtype)
+            # test_join_with_categorical_index requires excluding CategoricalDtype
             and self._can_use_libjoin
             and other._can_use_libjoin
             and (self.is_unique or other.is_unique)
@@ -4935,7 +4913,10 @@ def _get_leaf_sorter(labels: list[np.ndarray]) -> npt.NDArray[np.intp]:
     def _join_monotonic(
         self, other: Index, how: JoinHow = "left"
     ) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]:
-        # We only get here with matching dtypes and both monotonic increasing
+        # We only get here with (caller is responsible for ensuring):
+        #  1) matching dtypes
+        #  2) both monotonic increasing
+        #  3) other.is_unique or self.is_unique
         assert other.dtype == self.dtype
         assert self._can_use_libjoin and other._can_use_libjoin
 
@@ -5017,6 +4998,10 @@ def _can_use_libjoin(self) -> bool:
         making a copy. If we cannot, this negates the performance benefit
         of using libjoin.
         """
+        if not self.is_monotonic_increasing:
+            # The libjoin functions all assume monotonicity.
+            return False
+
         if type(self) is Index:
             # excludes EAs, but include masks, we get here with monotonic
             # values only, meaning no NA
@@ -5027,7 +5012,7 @@ def _can_use_libjoin(self) -> bool:
             )
         # Exclude index types where the conversion to numpy converts to object dtype,
         #  which negates the performance benefit of libjoin
-        # TODO: exclude RangeIndex (which allocated memory)?
+        # TODO: exclude RangeIndex (which allocates memory)?
         #  Doing so seems to break test_concat_datetime_timezone
         return not isinstance(self, (ABCIntervalIndex, ABCMultiIndex))
 
@@ -5167,7 +5152,7 @@ def _get_join_target(self) -> np.ndarray:
         # TODO: exclude ABCRangeIndex case here as it copies
         target = self._get_engine_target()
         if not isinstance(target, np.ndarray):
-            raise ValueError("_can_use_libjoin should raise in this case.")
+            raise ValueError("_can_use_libjoin should return False.")
         return target
 
     def _from_join_target(self, result: np.ndarray) -> ArrayLike:

From a4b702b7a9958ac043f6df677ee6dcd693aa180f Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Mon, 16 Oct 2023 18:21:29 -0700
Subject: [PATCH 4/6] avoid allocating mapping in monotonic cases

---
 pandas/_libs/index.pyx | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx
index 5c7680bc6fb6c..5455c9aa2c866 100644
--- a/pandas/_libs/index.pyx
+++ b/pandas/_libs/index.pyx
@@ -247,6 +247,10 @@ cdef class IndexEngine:
 
     @property
     def is_unique(self) -> bool:
+        # for why we check is_monotonic_increasing here, see
+        # https://github.com/pandas-dev/pandas/pull/55342#discussion_r1361405781
+        if self.need_monotonic_check:
+            self.is_monotonic_increasing
         if self.need_unique_check:
             self._do_unique_check()
 
@@ -922,6 +926,10 @@ cdef class SharedEngine:
 
     @property
     def is_unique(self) -> bool:
+        # for why we check is_monotonic_increasing here, see
+        # https://github.com/pandas-dev/pandas/pull/55342#discussion_r1361405781
+        if self.need_monotonic_check:
+            self.is_monotonic_increasing
         if self.need_unique_check:
             arr = self.values.unique()
             self.unique = len(arr) == len(self.values)

From 6bf14b5ce99eeaff750cb96b99418ad55adff946 Mon Sep 17 00:00:00 2001
From: Luke Manley <lukemanley@gmail.com>
Date: Mon, 30 Oct 2023 18:04:42 -0400
Subject: [PATCH 5/6] fix categorical memory usage tests

---
 pandas/core/frame.py                       | 2 +-
 pandas/tests/extension/test_categorical.py | 5 -----
 2 files changed, 1 insertion(+), 6 deletions(-)

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index f37be37f37693..618b84e70fb13 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -3692,7 +3692,7 @@ def memory_usage(self, index: bool = True, deep: bool = False) -> Series:
         many repeated values.
 
         >>> df['object'].astype('category').memory_usage(deep=True)
-        5244
+        5136
         """
         result = self._constructor_sliced(
             [c.memory_usage(index=False, deep=deep) for col, c in self.items()],
diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py
index 5cde5df4bc007..650da67523b45 100644
--- a/pandas/tests/extension/test_categorical.py
+++ b/pandas/tests/extension/test_categorical.py
@@ -73,11 +73,6 @@ def data_for_grouping():
 
 
 class TestCategorical(base.ExtensionTests):
-    @pytest.mark.xfail(reason="Memory usage doesn't match")
-    def test_memory_usage(self, data):
-        # TODO: Is this deliberate?
-        super().test_memory_usage(data)
-
     def test_contains(self, data, data_missing):
         # GH-37867
         # na value handling in Categorical.__contains__ is deprecated.

From 81e74a9e85f292575920762d9d448520ebacd79d Mon Sep 17 00:00:00 2001
From: Luke Manley <lukemanley@gmail.com>
Date: Wed, 1 Nov 2023 07:24:11 -0400
Subject: [PATCH 6/6] catch decimal.InvalidOperation

---
 pandas/_libs/index.pyx | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx
index 5455c9aa2c866..1202898274227 100644
--- a/pandas/_libs/index.pyx
+++ b/pandas/_libs/index.pyx
@@ -42,6 +42,8 @@ from pandas._libs.missing cimport (
     is_matching_na,
 )
 
+from decimal import InvalidOperation
+
 # Defines shift of MultiIndex codes to avoid negative codes (missing values)
 multiindex_nulls_shift = 2
 
@@ -284,7 +286,7 @@ cdef class IndexEngine:
                 values = self.values
                 self.monotonic_inc, self.monotonic_dec, is_strict_monotonic = \
                     self._call_monotonic(values)
-            except TypeError:
+            except (TypeError, InvalidOperation):
                 self.monotonic_inc = 0
                 self.monotonic_dec = 0
                 is_strict_monotonic = 0