From fff8a6b28dfb8b96d7d35f5c6cc8f73b5294d4c5 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Mon, 17 Apr 2023 17:00:04 +0100 Subject: [PATCH 1/7] ENH: better dtype inference when doing DataFrame reductions --- pandas/core/frame.py | 28 ++++---- pandas/tests/frame/test_reductions.py | 100 +++++++++++++++++++++++++- pandas/tests/groupby/test_apply.py | 2 +- 3 files changed, 113 insertions(+), 17 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 5341b87c39676..0b060007bed06 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -94,6 +94,7 @@ is_dataclass, is_dict_like, is_dtype_equal, + is_extension_array_dtype, is_float, is_float_dtype, is_hashable, @@ -10899,14 +10900,23 @@ def _get_data() -> DataFrame: # simple case where we can use BlockManager.reduce res = df._mgr.reduce(blk_func) out = df._constructor(res).iloc[0] + mgr_dtypes = df._mgr.get_dtypes().tolist() + if out.dtype != object: + # e.g. if data dtype is UInt8 and out.dtype is uint64, then common is UInt64 + mgr_dtypes.append(out.dtype) + common_dtype = find_common_type(mgr_dtypes) if mgr_dtypes else None + is_ext_dtype = common_dtype is not None and is_extension_array_dtype( + common_dtype + ) + if out_dtype is not None: out = out.astype(out_dtype) + elif is_ext_dtype and out.dtype == common_dtype.type: + out = out.astype(common_dtype) + elif out.dtype == object and isna(out).all(): + out = out.astype(common_dtype) elif (df._mgr.get_dtypes() == object).any(): out = out.astype(object) - elif len(self) == 0 and name in ("sum", "prod"): - # Even if we are object dtype, follow numpy and return - # float64, see test_apply_funcs_over_empty - out = out.astype(np.float64) return out @@ -11157,11 +11167,6 @@ def idxmin( ) indices = res._values - # indices will always be np.ndarray since axis is not None and - # values is a 2d array for DataFrame - # error: Item "int" of "Union[int, Any]" has no attribute "__iter__" - assert isinstance(indices, np.ndarray) # for mypy - index = data._get_axis(axis) result = [index[i] if i >= 0 else np.nan for i in indices] final_result = data._constructor_sliced(result, index=data._get_agg_axis(axis)) @@ -11182,11 +11187,6 @@ def idxmax( ) indices = res._values - # indices will always be np.ndarray since axis is not None and - # values is a 2d array for DataFrame - # error: Item "int" of "Union[int, Any]" has no attribute "__iter__" - assert isinstance(indices, np.ndarray) # for mypy - index = data._get_axis(axis) result = [index[i] if i >= 0 else np.nan for i in indices] final_result = data._constructor_sliced(result, index=data._get_agg_axis(axis)) diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 0d352b8e34f37..75c3f81cb7560 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -917,7 +917,7 @@ def test_mean_extensionarray_numeric_only_true(self): arr = np.random.randint(1000, size=(10, 5)) df = DataFrame(arr, dtype="Int64") result = df.mean(numeric_only=True) - expected = DataFrame(arr).mean() + expected = DataFrame(arr, dtype="Float64").mean() tm.assert_series_equal(result, expected) def test_stats_mixed_type(self, float_string_frame): @@ -1544,6 +1544,100 @@ def test_reduction_timedelta_smallest_unit(self): tm.assert_series_equal(result, expected) +class TestEmptyDataFrameReductions: + @pytest.mark.parametrize( + "opname, dtype, exp_value, exp_dtype", + [ + ("sum", np.int8, 0, np.int64), + ("prod", np.int8, 1, np.int64), + ("sum", np.int64, 0, np.int64), + ("prod", np.int64, 1, np.int64), + ("sum", np.uint8, 0, np.int64), + ("prod", np.uint8, 1, np.uint64), + ("sum", np.uint64, 0, np.int64), + ("prod", np.uint64, 1, np.uint64), + ("sum", np.float32, 0, np.float32), + ("prod", np.float32, 1, np.float32), + ("sum", np.float64, 0, np.float64), + ], + ) + def test_df_empty_min_count_0(self, opname, dtype, exp_value, exp_dtype): + df = DataFrame({0: [], 1: []}, dtype=dtype) + result = getattr(df, opname)(min_count=0) + + expected = Series([exp_value, exp_value], dtype=exp_dtype) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "opname, dtype, exp_dtype", + [ + ("sum", np.int8, np.float64), + ("prod", np.int8, np.float64), + ("sum", np.int64, np.float64), + ("prod", np.int64, np.float64), + ("sum", np.uint8, np.float64), + ("prod", np.uint8, np.float64), + ("sum", np.uint64, np.float64), + ("prod", np.uint64, np.float64), + ("sum", np.float32, np.float32), + ("prod", np.float32, np.float32), + ("sum", np.float64, np.float64), + ], + ) + def test_df_empty_min_count_1(self, opname, dtype, exp_dtype): + df = DataFrame({0: [], 1: []}, dtype=dtype) + result = getattr(df, opname)(min_count=1) + + expected = Series([np.nan, np.nan], dtype=exp_dtype) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "opname, dtype, exp_value, exp_dtype", + [ + ("sum", "Int8", 0, "Int64"), + ("prod", "Int8", 1, "Int64"), + ("sum", "Int64", 0, "Int64"), + ("prod", "Int64", 1, "Int64"), + ("sum", "UInt8", 0, "UInt64"), + ("prod", "UInt8", 1, "UInt64"), + ("sum", "UInt64", 0, "UInt64"), + ("prod", "UInt64", 1, "UInt64"), + ("sum", "Float32", 0, "Float32"), + ("prod", "Float32", 1, "Float32"), + ("sum", "Float64", 0, "Float64"), + ], + ) + def test_df_empty_nullable_min_count_0(self, opname, dtype, exp_value, exp_dtype): + df = DataFrame({0: [], 1: []}, dtype=dtype) + result = getattr(df, opname)(min_count=0) + + expected = Series([exp_value, exp_value], dtype=exp_dtype) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "opname, dtype, exp_dtype", + [ + ("sum", "Int8", "Int8"), + ("prod", "Int8", "Int8"), + ("sum", "Int64", "Int64"), + ("prod", "Int64", "Int64"), + ("sum", "UInt8", "UInt8"), + ("prod", "UInt8", "UInt8"), + ("sum", "UInt64", "UInt64"), + ("prod", "UInt64", "UInt64"), + ("sum", "Float32", "Float32"), + ("prod", "Float32", "Float32"), + ("sum", "Float64", "Float64"), + ], + ) + def test_df_empty_nullable_min_count_1(self, opname, dtype, exp_dtype): + df = DataFrame({0: [], 1: []}, dtype=dtype) + result = getattr(df, opname)(min_count=1) + + expected = Series([pd.NA, pd.NA], dtype=exp_dtype) + tm.assert_series_equal(result, expected) + + class TestNuisanceColumns: @pytest.mark.parametrize("method", ["any", "all"]) def test_any_all_categorical_dtype_nuisance_column(self, method): @@ -1678,7 +1772,9 @@ def test_minmax_extensionarray(method, numeric_only): df = DataFrame({"Int64": ser}) result = getattr(df, method)(numeric_only=numeric_only) expected = Series( - [getattr(int64_info, method)], index=Index(["Int64"], dtype="object") + [getattr(int64_info, method)], + index=Index(["Int64"], dtype="object"), + dtype=pd.Int64Dtype(), ) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index e5599d60b4f0d..aaedf00932345 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -945,7 +945,7 @@ def test_apply_multi_level_name(category): b = pd.Categorical(b, categories=[1, 2, 3]) expected_index = pd.CategoricalIndex([1, 2, 3], categories=[1, 2, 3], name="B") # GH#40669 - summing an empty frame gives float dtype - expected_values = [20.0, 25.0, 0.0] + expected_values = [20, 25, 0] else: expected_index = Index([1, 2], name="B") expected_values = [20, 25] From e399af3d78f9d7e784894a1c91db619eb28f260e Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Mon, 17 Apr 2023 18:55:21 +0100 Subject: [PATCH 2/7] ENH: Better dtype inference when doing reductions on dataframes of nullable arrays --- doc/source/whatsnew/v2.1.0.rst | 35 ++++++++++++++++++++++++++++++---- pandas/core/frame.py | 13 +++++++++++-- 2 files changed, 42 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 4c1399a0defe7..f11675a78d518 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -14,12 +14,39 @@ including other versions of pandas. Enhancements ~~~~~~~~~~~~ -.. _whatsnew_210.enhancements.enhancement1: +.. _whatsnew_210.enhancements.better_dtype_inference_for_frame_reductions: + +Better dtype inference when doing reductions on dataframes of nullable arrays +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Dtype inference when doing reductions on DataFrames with nullable arrays has been improved. + +*Previous behavior*: + +.. code-block:: ipython + + In [1]: df = pd.DataFrame({"a": [1], "b": [pd.NA]}, dtype="Int64") + In [2]: df.sum() + a 1 + b 0 + dtype: int64 + In [3]: df.sum(min_count=1) + a 1 + b + dtype: object + +With the new behavior, we keep the original dtype: + +*New behavior*: + +.. ipython:: python + + df = pd.DataFrame({"a": [1], "b": [pd.NA]}, dtype="Int64") + df.sum() + df.sum(min_count=1) -enhancement1 -^^^^^^^^^^^^ -.. _whatsnew_210.enhancements.enhancement2: +.. _whatsnew_210.enhancements.map_works_for_all_array_types: ``map(func, na_action="ignore")`` now works for all array types ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 0b060007bed06..63ec449a07633 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -51,6 +51,7 @@ NoDefault, is_range_indexer, no_default, + infer_dtype, ) from pandas.compat import PYPY from pandas.compat._optional import import_optional_dependency @@ -106,6 +107,8 @@ is_sequence, needs_i8_conversion, pandas_dtype, + is_unsigned_integer_dtype, + is_signed_integer_dtype, ) from pandas.core.dtypes.dtypes import ExtensionDtype from pandas.core.dtypes.missing import ( @@ -10913,10 +10916,16 @@ def _get_data() -> DataFrame: out = out.astype(out_dtype) elif is_ext_dtype and out.dtype == common_dtype.type: out = out.astype(common_dtype) - elif out.dtype == object and isna(out).all(): - out = out.astype(common_dtype) elif (df._mgr.get_dtypes() == object).any(): out = out.astype(object) + elif is_ext_dtype and out.dtype == object: + inferred_dtype = infer_dtype(out) + if isna(out).all(): + out = out.astype(common_dtype) + elif inferred_dtype == "integer": + out = out.astype("Int64") + elif inferred_dtype == "float": + out = out.astype("Float64") return out From 7864b034225261f82727014436c189a5a11109d3 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Mon, 17 Apr 2023 19:10:25 +0100 Subject: [PATCH 3/7] add issue number --- doc/source/whatsnew/v2.1.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index f11675a78d518..e434d1b3ac3e9 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -19,7 +19,7 @@ Enhancements Better dtype inference when doing reductions on dataframes of nullable arrays ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Dtype inference when doing reductions on DataFrames with nullable arrays has been improved. +Dtype inference when doing reductions on DataFrames with nullable arrays has been improved (:issue:`52707`). *Previous behavior*: From 4f386349e7c01e03052b99879cdfcb1967760ec5 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Mon, 17 Apr 2023 19:22:16 +0100 Subject: [PATCH 4/7] various pre-commit stuff --- pandas/core/frame.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 63ec449a07633..2de7eb923b841 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -49,9 +49,9 @@ from pandas._libs.hashtable import duplicated from pandas._libs.lib import ( NoDefault, + infer_dtype, is_range_indexer, no_default, - infer_dtype, ) from pandas.compat import PYPY from pandas.compat._optional import import_optional_dependency @@ -107,8 +107,6 @@ is_sequence, needs_i8_conversion, pandas_dtype, - is_unsigned_integer_dtype, - is_signed_integer_dtype, ) from pandas.core.dtypes.dtypes import ExtensionDtype from pandas.core.dtypes.missing import ( From 60f0b1eeb4bec7f77fc88cca64c9724483ea5673 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Mon, 17 Apr 2023 23:21:17 +0100 Subject: [PATCH 5/7] platform issues --- pandas/tests/frame/test_reductions.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 75c3f81cb7560..73d72af490f02 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -6,7 +6,7 @@ import numpy as np import pytest -from pandas.compat import is_platform_windows +from pandas.compat import is_platform_windows, IS64 import pandas.util._test_decorators as td import pandas as pd @@ -1549,11 +1549,11 @@ class TestEmptyDataFrameReductions: "opname, dtype, exp_value, exp_dtype", [ ("sum", np.int8, 0, np.int64), - ("prod", np.int8, 1, np.int64), + ("prod", np.int8, 1, np.int_), ("sum", np.int64, 0, np.int64), ("prod", np.int64, 1, np.int64), ("sum", np.uint8, 0, np.int64), - ("prod", np.uint8, 1, np.uint64), + ("prod", np.uint8, 1, np.uint), ("sum", np.uint64, 0, np.int64), ("prod", np.uint64, 1, np.uint64), ("sum", np.float32, 0, np.float32), @@ -1594,12 +1594,12 @@ def test_df_empty_min_count_1(self, opname, dtype, exp_dtype): @pytest.mark.parametrize( "opname, dtype, exp_value, exp_dtype", [ - ("sum", "Int8", 0, "Int64"), - ("prod", "Int8", 1, "Int64"), + ("sum", "Int8", 0, ("Int64" if IS64 else "Int32")), + ("prod", "Int8", 1, ("Int64" if IS64 else "Int32")), ("sum", "Int64", 0, "Int64"), ("prod", "Int64", 1, "Int64"), - ("sum", "UInt8", 0, "UInt64"), - ("prod", "UInt8", 1, "UInt64"), + ("sum", "UInt8", 0, ("UInt64" if IS64 else "UInt32")), + ("prod", "UInt8", 1, ("UInt64" if IS64 else "UInt32")), ("sum", "UInt64", 0, "UInt64"), ("prod", "UInt64", 1, "UInt64"), ("sum", "Float32", 0, "Float32"), From 23093f912bb69786616961532fc9b60aa6e40a8e Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Mon, 17 Apr 2023 23:29:39 +0100 Subject: [PATCH 6/7] isort --- pandas/tests/frame/test_reductions.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 73d72af490f02..ebc0b402a6619 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -6,7 +6,10 @@ import numpy as np import pytest -from pandas.compat import is_platform_windows, IS64 +from pandas.compat import ( + IS64, + is_platform_windows, +) import pandas.util._test_decorators as td import pandas as pd From 1bd00e4da7856b348162dd0a2f0476d7e2700854 Mon Sep 17 00:00:00 2001 From: tp Date: Tue, 18 Apr 2023 08:33:52 +0100 Subject: [PATCH 7/7] fix windows failures --- pandas/tests/frame/test_reductions.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index ebc0b402a6619..dee8744388678 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -32,6 +32,8 @@ nanops, ) +is_windows_or_is32 = is_platform_windows() or not IS64 + def assert_stat_op_calc( opname, @@ -1597,12 +1599,12 @@ def test_df_empty_min_count_1(self, opname, dtype, exp_dtype): @pytest.mark.parametrize( "opname, dtype, exp_value, exp_dtype", [ - ("sum", "Int8", 0, ("Int64" if IS64 else "Int32")), - ("prod", "Int8", 1, ("Int64" if IS64 else "Int32")), + ("sum", "Int8", 0, ("Int32" if is_windows_or_is32 else "Int64")), + ("prod", "Int8", 1, ("Int32" if is_windows_or_is32 else "Int64")), ("sum", "Int64", 0, "Int64"), ("prod", "Int64", 1, "Int64"), - ("sum", "UInt8", 0, ("UInt64" if IS64 else "UInt32")), - ("prod", "UInt8", 1, ("UInt64" if IS64 else "UInt32")), + ("sum", "UInt8", 0, ("UInt32" if is_windows_or_is32 else "UInt64")), + ("prod", "UInt8", 1, ("UInt32" if is_windows_or_is32 else "UInt64")), ("sum", "UInt64", 0, "UInt64"), ("prod", "UInt64", 1, "UInt64"), ("sum", "Float32", 0, "Float32"),