From 33b82fb4d3fd1e0c9b917ed49ecfbd86457cc83b Mon Sep 17 00:00:00 2001 From: richard Date: Mon, 27 Mar 2023 18:19:25 -0400 Subject: [PATCH 1/4] BUG: Revert GH#51335 --- pandas/core/frame.py | 122 ++++++++++++++++++-------- pandas/tests/frame/test_reductions.py | 15 +++- 2 files changed, 97 insertions(+), 40 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index bcba7c8c13f8c..69fa44c6ebb8b 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -102,6 +102,7 @@ is_integer_dtype, is_iterator, is_list_like, + is_object_dtype, is_scalar, is_sequence, needs_i8_conversion, @@ -10925,44 +10926,93 @@ def _get_data() -> DataFrame: data = self._get_bool_data() return data - # Case with EAs see GH#35881 - df = self - if numeric_only: - df = _get_data() + if numeric_only or axis == 0: + # For numeric_only non-None and axis non-None, we know + # which blocks to use and no try/except is needed. + # For numeric_only=None only the case with axis==0 and no object + # dtypes are unambiguous can be handled with BlockManager.reduce + # Case with EAs see GH#35881 + df = self + if numeric_only: + df = _get_data() + if axis == 1: + df = df.T + axis = 0 + + # After possibly _get_data and transposing, we are now in the + # simple case where we can use BlockManager.reduce + res = df._mgr.reduce(blk_func) + out = df._constructor(res).iloc[0] + if out_dtype is not None: + out = out.astype(out_dtype) + if axis == 0 and len(self) == 0 and name in ["sum", "prod"]: + # Even if we are object dtype, follow numpy and return + # float64, see test_apply_funcs_over_empty + out = out.astype(np.float64) + + return out + + assert not numeric_only and axis in (1, None) + + data = self + values = data.values + result = func(values) + + if hasattr(result, "dtype"): + if filter_type == "bool" and notna(result).all(): + result = result.astype(np.bool_) + elif filter_type is None and is_object_dtype(result.dtype): + try: + result = result.astype(np.float64) + except (ValueError, TypeError): + # try to coerce to the original dtypes item by item if we can + pass + if axis is None: - return func(df.values) - elif axis == 1: - if len(df.index) == 0: - # Taking a transpose would result in no columns, losing the dtype. - # In the empty case, reducing along axis 0 or 1 gives the same - # result dtype, so reduce with axis=0 and ignore values - result = df._reduce( - op, - name, - axis=0, - skipna=skipna, - numeric_only=False, - filter_type=filter_type, - **kwds, - ).iloc[:0] - result.index = df.index - return result - df = df.T - - # After possibly _get_data and transposing, we are now in the - # simple case where we can use BlockManager.reduce - res = df._mgr.reduce(blk_func) - out = df._constructor(res).iloc[0] - if out_dtype is not None: - out = out.astype(out_dtype) - elif (df._mgr.get_dtypes() == object).any(): - out = out.astype(object) - elif len(self) == 0 and name in ("sum", "prod"): - # Even if we are object dtype, follow numpy and return - # float64, see test_apply_funcs_over_empty - out = out.astype(np.float64) + return result - return out + labels = self._get_agg_axis(axis) + result = self._constructor_sliced(result, index=labels) + return result + + # # Case with EAs see GH#35881 + # df = self + # if numeric_only: + # df = _get_data() + # if axis is None: + # return func(df.values) + # elif axis == 1: + # if len(df.index) == 0: + # # Taking a transpose would result in no columns, losing the dtype. + # # In the empty case, reducing along axis 0 or 1 gives the same + # # result dtype, so reduce with axis=0 and ignore values + # result = df._reduce( + # op, + # name, + # axis=0, + # skipna=skipna, + # numeric_only=False, + # filter_type=filter_type, + # **kwds, + # ).iloc[:0] + # result.index = df.index + # return result + # df = df.T + # + # # After possibly _get_data and transposing, we are now in the + # # simple case where we can use BlockManager.reduce + # res = df._mgr.reduce(blk_func) + # out = df._constructor(res).iloc[0] + # if out_dtype is not None: + # out = out.astype(out_dtype) + # elif (df._mgr.get_dtypes() == object).any(): + # out = out.astype(object) + # elif len(self) == 0 and name in ("sum", "prod"): + # # Even if we are object dtype, follow numpy and return + # # float64, see test_apply_funcs_over_empty + # out = out.astype(np.float64) + # + # return out def _reduce_axis1(self, name: str, func, skipna: bool) -> Series: """ diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 28809e2ecb788..b57337ebeff76 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -317,8 +317,10 @@ def wrapper(x): DataFrame({0: [np.nan, 2], 1: [np.nan, 3], 2: [np.nan, 4]}, dtype=object), ], ) - def test_stat_operators_attempt_obj_array(self, method, df, axis): + def test_stat_operators_attempt_obj_array(self, method, df, axis, request): # GH#676 + if axis in (1, "columns") or method not in ("sum", "prod", "min", "max"): + request.node.add_marker(pytest.mark.xfail(reason="Revert of GH#51335")) assert df.values.dtype == np.object_ result = getattr(df, method)(axis=axis) expected = getattr(df.astype("f8"), method)(axis=axis).astype(object) @@ -402,6 +404,7 @@ def test_mean_includes_datetimes(self, tz): expected = Series([Timestamp("2000", tz=tz)], index=["A"]) tm.assert_series_equal(result, expected) + @pytest.mark.xfail(reason="Revert of GH#51335") def test_mean_mixed_string_decimal(self): # GH 11670 # possible bug when calculating mean of DataFrame? @@ -731,7 +734,9 @@ def test_sum_corner(self): tm.makePeriodIndex(0), ], ) - def test_axis_1_empty(self, all_reductions, index, using_array_manager): + def test_axis_1_empty(self, all_reductions, index, using_array_manager, request): + if all_reductions not in ("count", "any", "all"): + request.node.add_marker(pytest.mark.xfail(reason="Revert of GH#51335")) df = DataFrame(columns=["a"], index=index) result = getattr(df, all_reductions)(axis=1) if all_reductions in ("any", "all"): @@ -1464,6 +1469,7 @@ def test_preserve_timezone(self, initial: str, method): result = getattr(df, method)(axis=1) tm.assert_series_equal(result, expected) + @pytest.mark.xfail(reason="GH#51335") @pytest.mark.parametrize("method", ["min", "max"]) def test_minmax_tzaware_skipna_axis_1(self, method, skipna): # GH#51242 @@ -1671,9 +1677,10 @@ def test_prod_sum_min_count_mixed_object(): @pytest.mark.parametrize("method", ["min", "max", "mean", "median", "skew", "kurt"]) @pytest.mark.parametrize("numeric_only", [True, False]) -def test_reduction_axis_none_returns_scalar(method, numeric_only): +def test_reduction_axis_none_returns_scalar(method, numeric_only, request): # GH#21597 As of 2.0, axis=None reduces over all axes. - + if numeric_only: + request.node.add_marker(pytest.mark.xfail(reason="Revert of GH#51335")) df = DataFrame(np.random.randn(4, 4)) result = getattr(df, method)(axis=None, numeric_only=numeric_only) From 5cd80c4f4800767cb91d8a1831bac38067543390 Mon Sep 17 00:00:00 2001 From: richard Date: Mon, 27 Mar 2023 18:34:26 -0400 Subject: [PATCH 2/4] cleanup --- pandas/core/frame.py | 39 --------------------------------------- 1 file changed, 39 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 69fa44c6ebb8b..96048a454819f 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -10975,45 +10975,6 @@ def _get_data() -> DataFrame: result = self._constructor_sliced(result, index=labels) return result - # # Case with EAs see GH#35881 - # df = self - # if numeric_only: - # df = _get_data() - # if axis is None: - # return func(df.values) - # elif axis == 1: - # if len(df.index) == 0: - # # Taking a transpose would result in no columns, losing the dtype. - # # In the empty case, reducing along axis 0 or 1 gives the same - # # result dtype, so reduce with axis=0 and ignore values - # result = df._reduce( - # op, - # name, - # axis=0, - # skipna=skipna, - # numeric_only=False, - # filter_type=filter_type, - # **kwds, - # ).iloc[:0] - # result.index = df.index - # return result - # df = df.T - # - # # After possibly _get_data and transposing, we are now in the - # # simple case where we can use BlockManager.reduce - # res = df._mgr.reduce(blk_func) - # out = df._constructor(res).iloc[0] - # if out_dtype is not None: - # out = out.astype(out_dtype) - # elif (df._mgr.get_dtypes() == object).any(): - # out = out.astype(object) - # elif len(self) == 0 and name in ("sum", "prod"): - # # Even if we are object dtype, follow numpy and return - # # float64, see test_apply_funcs_over_empty - # out = out.astype(np.float64) - # - # return out - def _reduce_axis1(self, name: str, func, skipna: bool) -> Series: """ Special case for _reduce to try to avoid a potentially-expensive transpose. From dbf63e358791ba06bfacfac5c336951205336399 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Tue, 28 Mar 2023 17:06:12 -0400 Subject: [PATCH 3/4] revert whatsnew --- doc/source/whatsnew/v2.0.0.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 80b52d3b3955e..02d6a3c4312cc 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -777,7 +777,7 @@ Other API changes - The levels of the index of the :class:`Series` returned from ``Series.sparse.from_coo`` now always have dtype ``int32``. Previously they had dtype ``int64`` (:issue:`50926`) - :func:`to_datetime` with ``unit`` of either "Y" or "M" will now raise if a sequence contains a non-round ``float`` value, matching the ``Timestamp`` behavior (:issue:`50301`) - The methods :meth:`Series.round`, :meth:`DataFrame.__invert__`, :meth:`Series.__invert__`, :meth:`DataFrame.swapaxes`, :meth:`DataFrame.first`, :meth:`DataFrame.last`, :meth:`Series.first`, :meth:`Series.last` and :meth:`DataFrame.align` will now always return new objects (:issue:`51032`) -- :class:`DataFrame` and :class:`DataFrameGroupBy` aggregations (e.g. "sum") with object-dtype columns no longer infer non-object dtypes for their results, explicitly call ``result.infer_objects(copy=False)`` on the result to obtain the old behavior (:issue:`51205`, :issue:`49603`) +- :class:`DataFrameGroupBy` aggregations (e.g. "sum") with object-dtype columns no longer infer non-object dtypes for their results, explicitly call ``result.infer_objects(copy=False)`` on the result to obtain the old behavior (:issue:`51205`, :issue:`49603`) - Division by zero with :class:`ArrowDtype` dtypes returns ``-inf``, ``nan``, or ``inf`` depending on the numerator, instead of raising (:issue:`51541`) - Added :func:`pandas.api.types.is_any_real_numeric_dtype` to check for real numeric dtypes (:issue:`51152`) - :meth:`~arrays.ArrowExtensionArray.value_counts` now returns data with :class:`ArrowDtype` with ``pyarrow.int64`` type instead of ``"Int64"`` type (:issue:`51462`) @@ -1204,11 +1204,11 @@ Numeric ^^^^^^^ - Bug in :meth:`DataFrame.add` cannot apply ufunc when inputs contain mixed DataFrame type and Series type (:issue:`39853`) - Bug in arithmetic operations on :class:`Series` not propagating mask when combining masked dtypes and numpy dtypes (:issue:`45810`, :issue:`42630`) +- Bug in DataFrame reduction methods (e.g. :meth:`DataFrame.sum`) with object dtype, ``axis=1`` and ``numeric_only=False`` would not be coerced to float (:issue:`49551`) - Bug in :meth:`DataFrame.sem` and :meth:`Series.sem` where an erroneous ``TypeError`` would always raise when using data backed by an :class:`ArrowDtype` (:issue:`49759`) - Bug in :meth:`Series.__add__` casting to object for list and masked :class:`Series` (:issue:`22962`) - Bug in :meth:`~arrays.ArrowExtensionArray.mode` where ``dropna=False`` was not respected when there was ``NA`` values (:issue:`50982`) - Bug in :meth:`DataFrame.query` with ``engine="numexpr"`` and column names are ``min`` or ``max`` would raise a ``TypeError`` (:issue:`50937`) -- Bug in :meth:`DataFrame.min` and :meth:`DataFrame.max` with tz-aware data containing ``pd.NaT`` and ``axis=1`` would return incorrect results (:issue:`51242`) Conversion ^^^^^^^^^^ From cb43a0963f5f9ee05148cad75f3628d33253309f Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Tue, 28 Mar 2023 17:13:40 -0400 Subject: [PATCH 4/4] xfail for array manager --- pandas/tests/frame/test_reductions.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index b57337ebeff76..b40ba4bf48eaa 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -317,9 +317,15 @@ def wrapper(x): DataFrame({0: [np.nan, 2], 1: [np.nan, 3], 2: [np.nan, 4]}, dtype=object), ], ) - def test_stat_operators_attempt_obj_array(self, method, df, axis, request): + def test_stat_operators_attempt_obj_array( + self, method, df, axis, request, using_array_manager + ): # GH#676 - if axis in (1, "columns") or method not in ("sum", "prod", "min", "max"): + if ( + axis in (1, "columns") + or method not in ("sum", "prod", "min", "max") + or using_array_manager + ): request.node.add_marker(pytest.mark.xfail(reason="Revert of GH#51335")) assert df.values.dtype == np.object_ result = getattr(df, method)(axis=axis)