API: ignore empty range/object dtype in Index setop operations (string dtype compat) (pandas-dev#60797)

jorisvandenbossche · jorisvandenbossche · commit 55f59e97cb1f · 2025-02-17T10:36:50.000+01:00
(cherry picked from commit ee06e71)
diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst
@@ -59,6 +59,16 @@ Increased minimum version for Python
 
 pandas 2.3.0 supports Python 3.10 and higher.
 
+.. _whatsnew_230.api_changes:
+
+API changes
+~~~~~~~~~~~
+
+- When the ``future.infer_string`` option is enabled, Index set operations (like
+  union or intersection) now ignore the dtype of an empty ``RangeIndex`` or an
+  empty ``Index`` with object dtype when determining the dtype of the resulting
+  Index (:issue:`60797`)
+
 .. ---------------------------------------------------------------------------
 .. _whatsnew_230.deprecations:
 
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
@@ -6387,6 +6387,24 @@ def _find_common_type_compat(self, target) -> DtypeObj:
         """
         target_dtype, _ = infer_dtype_from(target)
 
+        if using_string_dtype():
+            # special case: if left or right is a zero-length RangeIndex or
+            # Index[object], it may simply come from a default empty constructor
+            # -> in that case ignore its dtype and always return the other one
+            # (https://github.com/pandas-dev/pandas/pull/60797)
+            from pandas.core.indexes.range import RangeIndex
+
+            if len(self) == 0 and (
+                isinstance(self, RangeIndex) or self.dtype == np.object_
+            ):
+                return target_dtype
+            if (
+                isinstance(target, Index)
+                and len(target) == 0
+                and (isinstance(target, RangeIndex) or target_dtype == np.object_)
+            ):
+                return self.dtype
+
         # special case: if one dtype is uint64 and the other a signed int, return object
         # See https://github.com/pandas-dev/pandas/issues/26778 for discussion
         # Now it's:
@@ -7005,6 +7023,14 @@ def insert(self, loc: int, item) -> Index:
 
         arr = self._values
 
+        if using_string_dtype() and len(self) == 0 and self.dtype == np.object_:
+            # special case: if we are an empty object-dtype Index, also
+            # take into account the inserted item for the resulting dtype
+            # (https://github.com/pandas-dev/pandas/pull/60797)
+            dtype = self._find_common_type_compat(item)
+            if dtype != self.dtype:
+                return self.astype(dtype).insert(loc, item)
+
         try:
             if isinstance(arr, ExtensionArray):
                 res_values = arr.insert(loc, item)
diff --git a/pandas/tests/frame/constructors/test_from_dict.py b/pandas/tests/frame/constructors/test_from_dict.py
@@ -3,8 +3,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas import (
     DataFrame,
     Index,
@@ -44,7 +42,6 @@ def test_constructor_single_row(self):
         )
         tm.assert_frame_equal(result, expected)
 
-    @pytest.mark.xfail(using_string_dtype(), reason="columns inferring logic broken")
     def test_constructor_list_of_series(self):
         data = [
             OrderedDict([["a", 1.5], ["b", 3.0], ["c", 4.0]]),
diff --git a/pandas/tests/frame/indexing/test_coercion.py b/pandas/tests/frame/indexing/test_coercion.py
@@ -103,12 +103,7 @@ def test_26395(indexer_al):
     df["D"] = 0
 
     indexer_al(df)["C", "D"] = 2
-    expected = DataFrame(
-        {"D": [0, 0, 2]},
-        index=["A", "B", "C"],
-        columns=pd.Index(["D"], dtype=object),
-        dtype=np.int64,
-    )
+    expected = DataFrame({"D": [0, 0, 2]}, index=["A", "B", "C"], dtype=np.int64)
     tm.assert_frame_equal(df, expected)
 
     with tm.assert_produces_warning(
diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py
@@ -1206,7 +1206,7 @@ def test_loc_setitem_datetimelike_with_inference(self):
         result = df.dtypes
         expected = Series(
             [np.dtype("timedelta64[ns]")] * 6 + [np.dtype("datetime64[ns]")] * 2,
-            index=Index(list("ABCDEFGH"), dtype=object),
+            index=list("ABCDEFGH"),
         )
         tm.assert_series_equal(result, expected)
 
diff --git a/pandas/tests/frame/indexing/test_insert.py b/pandas/tests/frame/indexing/test_insert.py
@@ -67,8 +67,7 @@ def test_insert_with_columns_dups(self):
         df.insert(0, "A", ["d", "e", "f"], allow_duplicates=True)
         df.insert(0, "A", ["a", "b", "c"], allow_duplicates=True)
         exp = DataFrame(
-            [["a", "d", "g"], ["b", "e", "h"], ["c", "f", "i"]],
-            columns=Index(["A", "A", "A"], dtype=object),
+            [["a", "d", "g"], ["b", "e", "h"], ["c", "f", "i"]], columns=["A", "A", "A"]
         )
         tm.assert_frame_equal(df, exp)
 
diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py
@@ -146,18 +146,32 @@ def test_setitem_different_dtype(self):
         )
         tm.assert_series_equal(result, expected)
 
-    def test_setitem_empty_columns(self):
-        # GH 13522
+    def test_setitem_overwrite_index(self):
+        # GH 13522 - assign the index as a column and then overwrite the values
+        # -> should not affect the index
         df = DataFrame(index=["A", "B", "C"])
         df["X"] = df.index
         df["X"] = ["x", "y", "z"]
         exp = DataFrame(
-            data={"X": ["x", "y", "z"]},
-            index=["A", "B", "C"],
-            columns=Index(["X"], dtype=object),
+            data={"X": ["x", "y", "z"]}, index=["A", "B", "C"], columns=["X"]
         )
         tm.assert_frame_equal(df, exp)
 
+    def test_setitem_empty_columns(self):
+        # Starting from an empty DataFrame and setting a column should result
+        # in a default string dtype for the columns' Index
+        # https://github.com/pandas-dev/pandas/issues/60338
+
+        df = DataFrame()
+        df["foo"] = [1, 2, 3]
+        expected = DataFrame({"foo": [1, 2, 3]})
+        tm.assert_frame_equal(df, expected)
+
+        df = DataFrame(columns=Index([]))
+        df["foo"] = [1, 2, 3]
+        expected = DataFrame({"foo": [1, 2, 3]})
+        tm.assert_frame_equal(df, expected)
+
     def test_setitem_dt64_index_empty_columns(self):
         rng = date_range("1/1/2000 00:00:00", "1/1/2000 1:59:50", freq="10s")
         df = DataFrame(index=np.arange(len(rng)))
@@ -171,9 +185,7 @@ def test_setitem_timestamp_empty_columns(self):
         df["now"] = Timestamp("20130101", tz="UTC").as_unit("ns")
 
         expected = DataFrame(
-            [[Timestamp("20130101", tz="UTC")]] * 3,
-            index=range(3),
-            columns=Index(["now"], dtype=object),
+            [[Timestamp("20130101", tz="UTC")]] * 3, index=range(3), columns=["now"]
         )
         tm.assert_frame_equal(df, expected)
 
@@ -212,7 +224,7 @@ def test_setitem_period_preserves_dtype(self):
         result = DataFrame([])
         result["a"] = data
 
-        expected = DataFrame({"a": data}, columns=Index(["a"], dtype=object))
+        expected = DataFrame({"a": data}, columns=["a"])
 
         tm.assert_frame_equal(result, expected)
 
@@ -939,7 +951,7 @@ def test_setitem_scalars_no_index(self):
         # GH#16823 / GH#17894
         df = DataFrame()
         df["foo"] = 1
-        expected = DataFrame(columns=Index(["foo"], dtype=object)).astype(np.int64)
+        expected = DataFrame(columns=["foo"]).astype(np.int64)
         tm.assert_frame_equal(df, expected)
 
     def test_setitem_newcol_tuple_key(self, float_frame):
diff --git a/pandas/tests/frame/methods/test_dropna.py b/pandas/tests/frame/methods/test_dropna.py
@@ -182,12 +182,9 @@ def test_dropna_multiple_axes(self):
         with pytest.raises(TypeError, match="supplying multiple axes"):
             inp.dropna(how="all", axis=(0, 1), inplace=True)
 
-    def test_dropna_tz_aware_datetime(self, using_infer_string):
+    def test_dropna_tz_aware_datetime(self):
         # GH13407
-
         df = DataFrame()
-        if using_infer_string:
-            df.columns = df.columns.astype("str")
         dt1 = datetime.datetime(2015, 1, 1, tzinfo=dateutil.tz.tzutc())
         dt2 = datetime.datetime(2015, 2, 2, tzinfo=dateutil.tz.tzutc())
         df["Time"] = [dt1]
diff --git a/pandas/tests/frame/methods/test_reset_index.py b/pandas/tests/frame/methods/test_reset_index.py
@@ -4,8 +4,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas.core.dtypes.common import (
     is_float_dtype,
     is_integer_dtype,
@@ -646,7 +644,6 @@ def test_rest_index_multiindex_categorical_with_missing_values(self, codes):
         tm.assert_frame_equal(res, expected)
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string) - GH#60338")
 @pytest.mark.parametrize(
     "array, dtype",
     [
@@ -783,3 +780,34 @@ def test_reset_index_false_index_name():
     result_frame.reset_index()
     expected_frame = DataFrame(range(5, 10), RangeIndex(range(5), name=False))
     tm.assert_frame_equal(result_frame, expected_frame)
+
+
+@pytest.mark.parametrize("columns", [None, Index([])])
+def test_reset_index_with_empty_frame(columns):
+    # Currently an empty DataFrame has a RangeIndex or an object-dtype Index, but
+    # when resetting the index we still want to end up with the default string dtype
+    # https://github.com/pandas-dev/pandas/issues/60338
+
+    index = Index([], name="foo")
+    df = DataFrame(index=index, columns=columns)
+    result = df.reset_index()
+    expected = DataFrame(columns=["foo"])
+    tm.assert_frame_equal(result, expected)
+
+    index = Index([1, 2, 3], name="foo")
+    df = DataFrame(index=index, columns=columns)
+    result = df.reset_index()
+    expected = DataFrame({"foo": [1, 2, 3]})
+    tm.assert_frame_equal(result, expected)
+
+    index = MultiIndex.from_tuples([], names=["foo", "bar"])
+    df = DataFrame(index=index, columns=columns)
+    result = df.reset_index()
+    expected = DataFrame(columns=["foo", "bar"])
+    tm.assert_frame_equal(result, expected)
+
+    index = MultiIndex.from_tuples([(1, 2), (2, 3)], names=["foo", "bar"])
+    df = DataFrame(index=index, columns=columns)
+    result = df.reset_index()
+    expected = DataFrame({"foo": [1, 2], "bar": [2, 3]})
+    tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py
@@ -21,8 +21,6 @@
 import pytest
 import pytz
 
-from pandas._config import using_string_dtype
-
 from pandas._libs import lib
 from pandas.compat.numpy import np_version_gt2
 from pandas.errors import IntCastingNaNError
@@ -2002,7 +2000,6 @@ def test_constructor_with_datetimes4(self):
         df = DataFrame({"value": dr})
         assert str(df.iat[0, 0].tz) == "US/Eastern"
 
-    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
     def test_constructor_with_datetimes5(self):
         # GH 7822
         # preserver an index with a tz on dict construction
diff --git a/pandas/tests/frame/test_query_eval.py b/pandas/tests/frame/test_query_eval.py
@@ -757,7 +757,6 @@ def test_check_tz_aware_index_query(self, tz_aware_fixture):
         tm.assert_frame_equal(result, expected)
 
         expected = DataFrame(df_index)
-        expected.columns = expected.columns.astype(object)
         result = df.reset_index().query('"2018-01-03 00:00:00+00" < time')
         tm.assert_frame_equal(result, expected)
 
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
@@ -1623,7 +1623,7 @@ def test_groupby_2d_malformed():
     d["label"] = ["l1", "l2"]
     tmp = d.groupby(["group"]).mean(numeric_only=True)
     res_values = np.array([[0.0, 1.0], [0.0, 1.0]])
-    tm.assert_index_equal(tmp.columns, Index(["zeros", "ones"], dtype=object))
+    tm.assert_index_equal(tmp.columns, Index(["zeros", "ones"]))
     tm.assert_numpy_array_equal(tmp.values, res_values)
 
 
diff --git a/pandas/tests/indexes/base_class/test_reshape.py b/pandas/tests/indexes/base_class/test_reshape.py
@@ -34,7 +34,7 @@ def test_insert(self):
 
         # test empty
         null_index = Index([])
-        tm.assert_index_equal(Index(["a"], dtype=object), null_index.insert(0, "a"))
+        tm.assert_index_equal(Index(["a"]), null_index.insert(0, "a"))
 
     def test_insert_missing(self, request, nulls_fixture, using_infer_string):
         if using_infer_string and nulls_fixture is pd.NA:
diff --git a/pandas/tests/indexes/base_class/test_setops.py b/pandas/tests/indexes/base_class/test_setops.py
@@ -240,7 +240,6 @@ def test_tuple_union_bug(self, method, expected, sort):
     def test_union_name_preservation(
         self, first_list, second_list, first_name, second_name, expected_name, sort
     ):
-        expected_dtype = object if not first_list or not second_list else "str"
         first = Index(first_list, name=first_name)
         second = Index(second_list, name=second_name)
         union = first.union(second, sort=sort)
@@ -251,7 +250,7 @@ def test_union_name_preservation(
             expected = Index(sorted(vals), name=expected_name)
             tm.assert_index_equal(union, expected)
         else:
-            expected = Index(vals, name=expected_name, dtype=expected_dtype)
+            expected = Index(vals, name=expected_name)
             tm.assert_index_equal(union.sort_values(), expected.sort_values())
 
     @pytest.mark.parametrize(
diff --git a/pandas/tests/indexes/datetimes/test_join.py b/pandas/tests/indexes/datetimes/test_join.py
@@ -70,13 +70,17 @@ def test_join_utc_convert(self, join_type):
         assert isinstance(result, DatetimeIndex)
         assert result.tz is timezone.utc
 
-    def test_datetimeindex_union_join_empty(self, sort):
+    def test_datetimeindex_union_join_empty(self, sort, using_infer_string):
         dti = date_range(start="1/1/2001", end="2/1/2001", freq="D")
         empty = Index([])
 
         result = dti.union(empty, sort=sort)
-        expected = dti.astype("O")
-        tm.assert_index_equal(result, expected)
+        if using_infer_string:
+            assert isinstance(result, DatetimeIndex)
+            tm.assert_index_equal(result, dti)
+        else:
+            expected = dti.astype("O")
+            tm.assert_index_equal(result, expected)
 
         result = dti.join(empty)
         assert isinstance(result, DatetimeIndex)
diff --git a/pandas/tests/indexes/test_old_base.py b/pandas/tests/indexes/test_old_base.py
@@ -442,10 +442,12 @@ def test_insert_out_of_bounds(self, index, using_infer_string):
         else:
             msg = "slice indices must be integers or None or have an __index__ method"
 
-        if using_infer_string and (
-            index.dtype == "string" or index.dtype == "category"  # noqa: PLR1714
-        ):
-            msg = "loc must be an integer between"
+        if using_infer_string:
+            if index.dtype == "string" or index.dtype == "category":  # noqa: PLR1714
+                msg = "loc must be an integer between"
+            elif index.dtype == "object" and len(index) == 0:
+                msg = "loc must be an integer between"
+                err = TypeError
 
         with pytest.raises(err, match=msg):
             index.insert(0.5, "foo")
diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py
@@ -524,7 +524,7 @@ def test_intersection_difference_match_empty(self, index, sort):
 @pytest.mark.parametrize(
     "method", ["intersection", "union", "difference", "symmetric_difference"]
 )
-def test_setop_with_categorical(index_flat, sort, method):
+def test_setop_with_categorical(index_flat, sort, method, using_infer_string):
     # MultiIndex tested separately in tests.indexes.multi.test_setops
     index = index_flat
 
@@ -533,10 +533,22 @@ def test_setop_with_categorical(index_flat, sort, method):
 
     result = getattr(index, method)(other, sort=sort)
     expected = getattr(index, method)(index, sort=sort)
+    if (
+        using_infer_string
+        and index.empty
+        and method in ("union", "symmetric_difference")
+    ):
+        expected = expected.astype("category")
     tm.assert_index_equal(result, expected, exact=exact)
 
     result = getattr(index, method)(other[:5], sort=sort)
     expected = getattr(index, method)(index[:5], sort=sort)
+    if (
+        using_infer_string
+        and index.empty
+        and method in ("union", "symmetric_difference")
+    ):
+        expected = expected.astype("category")
     tm.assert_index_equal(result, expected, exact=exact)
 
 
diff --git a/pandas/tests/indexing/test_at.py b/pandas/tests/indexing/test_at.py
@@ -13,7 +13,6 @@
     CategoricalIndex,
     DataFrame,
     DatetimeIndex,
-    Index,
     MultiIndex,
     Series,
     Timestamp,
@@ -71,11 +70,7 @@ def test_at_setitem_item_cache_cleared(self):
         df.at[0, "x"] = 4
         df.at[0, "cost"] = 789
 
-        expected = DataFrame(
-            {"x": [4], "cost": 789},
-            index=[0],
-            columns=Index(["x", "cost"], dtype=object),
-        )
+        expected = DataFrame({"x": [4], "cost": 789}, index=[0])
         tm.assert_frame_equal(df, expected)
 
         # And in particular, check that the _item_cache has updated correctly.
diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py
diff --git a/pandas/tests/indexing/test_partial.py b/pandas/tests/indexing/test_partial.py
diff --git a/pandas/tests/series/indexing/test_indexing.py b/pandas/tests/series/indexing/test_indexing.py
diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py
diff --git a/pandas/tests/series/methods/test_combine_first.py b/pandas/tests/series/methods/test_combine_first.py

Original file line number	Diff line number	Diff line change
`@@ -1206,7 +1206,7 @@ def test_loc_setitem_datetimelike_with_inference(self):`
`1206`	`1206`	`result = df.dtypes`
`1207`	`1207`	`expected = Series(`
`1208`	`1208`	`[np.dtype("timedelta64[ns]")] * 6 + [np.dtype("datetime64[ns]")] * 2,`
`1209`		`- index=Index(list("ABCDEFGH"), dtype=object),`
	`1209`	`+ index=list("ABCDEFGH"),`
`1210`	`1210`	`)`
`1211`	`1211`	`tm.assert_series_equal(result, expected)`
`1212`	`1212`
Original file line number	Diff line number	Diff line change
`@@ -67,8 +67,7 @@ def test_insert_with_columns_dups(self):`
`67`	`67`	`df.insert(0, "A", ["d", "e", "f"], allow_duplicates=True)`
`68`	`68`	`df.insert(0, "A", ["a", "b", "c"], allow_duplicates=True)`
`69`	`69`	`exp = DataFrame(`
`70`		`- [["a", "d", "g"], ["b", "e", "h"], ["c", "f", "i"]],`
`71`		`- columns=Index(["A", "A", "A"], dtype=object),`
	`70`	`+ [["a", "d", "g"], ["b", "e", "h"], ["c", "f", "i"]], columns=["A", "A", "A"]`
`72`	`71`	`)`
`73`	`72`	`tm.assert_frame_equal(df, exp)`
`74`	`73`