From cbf64fb3da745db2bc7275ee81a1e61177196b41 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Mon, 28 Aug 2023 13:16:58 +0200 Subject: [PATCH 1/3] Implement any and all for pyarrow numpy strings (#54591) Co-authored-by: Joris Van den Bossche (cherry picked from commit 23b546f96f49611541585c3914301ba874ced308) --- pandas/core/arrays/string_arrow.py | 13 +++++++++++++ pandas/tests/extension/test_string.py | 6 +++++- pandas/tests/reductions/test_reductions.py | 19 +++++++++++++++++++ 3 files changed, 37 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index bc1d7cb52e196..87dd48b3569d8 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -554,3 +554,16 @@ def value_counts(self, dropna: bool = True): return Series( result._values.to_numpy(), index=result.index, name=result.name, copy=False ) + + def _reduce( + self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs + ): + if name in ["any", "all"]: + arr = pc.and_kleene( + pc.invert(pc.is_null(self._pa_array)), pc.not_equal(self._pa_array, "") + ) + return ArrowExtensionArray(arr)._reduce( + name, skipna=skipna, keepdims=keepdims, **kwargs + ) + else: + return super()._reduce(name, skipna=skipna, keepdims=keepdims, **kwargs) diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 069d53aeb248f..1211baad0b568 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -161,7 +161,11 @@ class TestReduce(base.BaseReduceTests): def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna): op_name = all_numeric_reductions - if op_name in ["min", "max"]: + if ( + op_name in ["min", "max"] + or data.dtype.storage == "pyarrow_numpy" # type: ignore[union-attr] + and op_name in ("any", "all") + ): return None ser = pd.Series(data) diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index 87892a81cef3d..021252500e814 100644 --- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -1078,6 +1078,25 @@ def test_any_all_datetimelike(self): assert df.any().all() assert not df.all().any() + def test_any_all_pyarrow_string(self): + # GH#54591 + pytest.importorskip("pyarrow") + ser = Series(["", "a"], dtype="string[pyarrow_numpy]") + assert ser.any() + assert not ser.all() + + ser = Series([None, "a"], dtype="string[pyarrow_numpy]") + assert ser.any() + assert not ser.all() + + ser = Series([None, ""], dtype="string[pyarrow_numpy]") + assert not ser.any() + assert not ser.all() + + ser = Series(["a", "b"], dtype="string[pyarrow_numpy]") + assert ser.any() + assert ser.all() + def test_timedelta64_analytics(self): # index min/max dti = date_range("2012-1-1", periods=3, freq="D") From 21bacdced1af4eaa185ff942945d46ef6c6d29d1 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Mon, 28 Aug 2023 14:26:13 +0200 Subject: [PATCH 2/3] Fix --- pandas/tests/extension/base/reduce.py | 2 +- pandas/tests/extension/test_string.py | 13 ++++++++----- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/pandas/tests/extension/base/reduce.py b/pandas/tests/extension/base/reduce.py index a6532a6190467..43b2df4290eed 100644 --- a/pandas/tests/extension/base/reduce.py +++ b/pandas/tests/extension/base/reduce.py @@ -25,7 +25,7 @@ def check_reduce(self, s, op_name, skipna): try: alt = s.astype("float64") - except TypeError: + except (TypeError, ValueError): # e.g. Interval can't cast, so let's cast to object and do # the reduction pointwise alt = s.astype(object) diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 1211baad0b568..a98187ffbbc84 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -157,15 +157,18 @@ def test_fillna_no_op_returns_copy(self, data): class TestReduce(base.BaseReduceTests): + def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool: + return ( + op_name in ["min", "max"] + or ser.dtype.storage == "pyarrow_numpy" # type: ignore[union-attr] + and op_name in ("any", "all") + ) + @pytest.mark.parametrize("skipna", [True, False]) def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna): op_name = all_numeric_reductions - if ( - op_name in ["min", "max"] - or data.dtype.storage == "pyarrow_numpy" # type: ignore[union-attr] - and op_name in ("any", "all") - ): + if op_name in ["min", "max"]: return None ser = pd.Series(data) From 1772b1b9c5f78384496f979fac5088a850c023d3 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Mon, 28 Aug 2023 14:26:27 +0200 Subject: [PATCH 3/3] Fix --- pandas/tests/extension/test_string.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index a98187ffbbc84..5176289994033 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -159,8 +159,7 @@ def test_fillna_no_op_returns_copy(self, data): class TestReduce(base.BaseReduceTests): def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool: return ( - op_name in ["min", "max"] - or ser.dtype.storage == "pyarrow_numpy" # type: ignore[union-attr] + ser.dtype.storage == "pyarrow_numpy" # type: ignore[union-attr] and op_name in ("any", "all") )