From 73d64eb086d2edc5881400286cc2049f4cdb1141 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 19 Apr 2018 16:06:23 +0200 Subject: [PATCH 1/5] Fix pd.merge to preserve ExtensionArrays dtypes --- pandas/core/dtypes/common.py | 2 +- pandas/core/internals.py | 4 ++-- pandas/tests/extension/base/reshaping.py | 21 +++++++++++++++++++ .../extension/category/test_categorical.py | 4 ++++ 4 files changed, 28 insertions(+), 3 deletions(-) diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 3a90feb7ccd7d..c45838e6040a9 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -1807,7 +1807,7 @@ def _get_dtype(arr_or_dtype): return arr_or_dtype elif isinstance(arr_or_dtype, type): return np.dtype(arr_or_dtype) - elif isinstance(arr_or_dtype, CategoricalDtype): + elif isinstance(arr_or_dtype, ExtensionDtype): return arr_or_dtype elif isinstance(arr_or_dtype, DatetimeTZDtype): return arr_or_dtype diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 37d11296400be..c7ffd5e7c2fe1 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -5541,7 +5541,7 @@ def concatenate_join_units(join_units, concat_axis, copy): if len(to_concat) == 1: # Only one block, nothing to concatenate. concat_values = to_concat[0] - if copy and concat_values.base is not None: + if copy and getattr(concat_values, 'base', 1) is not None: concat_values = concat_values.copy() else: concat_values = _concat._concat_compat(to_concat, axis=concat_axis) @@ -5823,7 +5823,7 @@ def get_reindexed_values(self, empty_dtype, upcasted_na): # External code requested filling/upcasting, bool values must # be upcasted to object to avoid being upcasted to numeric. values = self.block.astype(np.object_).values - elif self.block.is_categorical: + elif self.block.is_extension: values = self.block.values else: # No dtype upcasting is done here, it will be performed during diff --git a/pandas/tests/extension/base/reshaping.py b/pandas/tests/extension/base/reshaping.py index efc22c19a3eef..b1e3e87576375 100644 --- a/pandas/tests/extension/base/reshaping.py +++ b/pandas/tests/extension/base/reshaping.py @@ -95,3 +95,24 @@ def test_set_frame_overwrite_object(self, data): df = pd.DataFrame({"A": [1] * len(data)}, dtype=object) df['A'] = data assert df.dtypes['A'] == data.dtype + + def test_merge(self, data, na_value): + + df1 = pd.DataFrame({'int1': [1, 2, 3], 'key': [0, 1, 2], + 'ext': data[:3]}) + df2 = pd.DataFrame({'int2': [1, 2, 3, 4], 'key': [0, 0, 1, 3]}) + + res = pd.merge(df1, df2) + exp = pd.DataFrame( + {'int1': [1, 1, 2], 'int2': [1, 2, 3], 'key': [0, 0, 1], + 'ext': data._constructor_from_sequence( + [data[0], data[0], data[1]])}) + self.assert_frame_equal(res, exp[['ext', 'int1', 'key', 'int2']]) + + res = pd.merge(df1, df2, how='outer') + exp = pd.DataFrame( + {'int1': [1, 1, 2, 3, np.nan], 'int2': [1, 2, 3, np.nan, 4], + 'key': [0, 0, 1, 2, 3], + 'ext': data._constructor_from_sequence( + [data[0], data[0], data[1], data[2], na_value])}) + self.assert_frame_equal(res, exp[['ext', 'int1', 'key', 'int2']]) diff --git a/pandas/tests/extension/category/test_categorical.py b/pandas/tests/extension/category/test_categorical.py index 27c156c15203f..6ebe700f13be0 100644 --- a/pandas/tests/extension/category/test_categorical.py +++ b/pandas/tests/extension/category/test_categorical.py @@ -75,6 +75,10 @@ def test_align(self, data, na_value): def test_align_frame(self, data, na_value): pass + @pytest.mark.skip(reason="Unobserved categories preseved in concat.") + def test_merge(self, data, na_value): + pass + class TestGetitem(base.BaseGetitemTests): @pytest.mark.skip(reason="Backwards compatibility") From 716e928cff5b587b7622a5ba99ea024e0b0672d5 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 20 Apr 2018 07:02:21 -0500 Subject: [PATCH 2/5] Fixed order. The Decimal base class failed to check the global order of columns. Fixed that as well. --- pandas/tests/extension/base/reshaping.py | 4 ++-- pandas/tests/extension/decimal/test_decimal.py | 8 ++++++++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/pandas/tests/extension/base/reshaping.py b/pandas/tests/extension/base/reshaping.py index b1e3e87576375..ef9cddc88dcba 100644 --- a/pandas/tests/extension/base/reshaping.py +++ b/pandas/tests/extension/base/reshaping.py @@ -107,7 +107,7 @@ def test_merge(self, data, na_value): {'int1': [1, 1, 2], 'int2': [1, 2, 3], 'key': [0, 0, 1], 'ext': data._constructor_from_sequence( [data[0], data[0], data[1]])}) - self.assert_frame_equal(res, exp[['ext', 'int1', 'key', 'int2']]) + self.assert_frame_equal(res, exp[['int1', 'key', 'ext', 'int2']]) res = pd.merge(df1, df2, how='outer') exp = pd.DataFrame( @@ -115,4 +115,4 @@ def test_merge(self, data, na_value): 'key': [0, 0, 1, 2, 3], 'ext': data._constructor_from_sequence( [data[0], data[0], data[1], data[2], na_value])}) - self.assert_frame_equal(res, exp[['ext', 'int1', 'key', 'int2']]) + self.assert_frame_equal(res, exp[['int1', 'key', 'ext', 'int2']]) diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index d509170565e1a..53d74cd6d38cb 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -72,6 +72,14 @@ def assert_series_equal(self, left, right, *args, **kwargs): def assert_frame_equal(self, left, right, *args, **kwargs): # TODO(EA): select_dtypes + tm.assert_index_equal( + left.columns, right.columns, + exact=kwargs.get('check_column_type', 'equiv'), + check_names=kwargs.get('check_names', True), + check_exact=kwargs.get('check_exact', False), + check_categorical=kwargs.get('check_categorical', True), + obj='{obj}.columns'.format(obj=kwargs.get('obj', 'DataFrame'))) + decimals = (left.dtypes == 'decimal').index for col in decimals: From 8824a47345ff41d5d3b88b04abdfe488a20c86bd Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 20 Apr 2018 07:04:08 -0500 Subject: [PATCH 3/5] Added issue number --- pandas/tests/extension/base/reshaping.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/extension/base/reshaping.py b/pandas/tests/extension/base/reshaping.py index ef9cddc88dcba..8776549fe80c3 100644 --- a/pandas/tests/extension/base/reshaping.py +++ b/pandas/tests/extension/base/reshaping.py @@ -97,7 +97,7 @@ def test_set_frame_overwrite_object(self, data): assert df.dtypes['A'] == data.dtype def test_merge(self, data, na_value): - + # GH-20743 df1 = pd.DataFrame({'int1': [1, 2, 3], 'key': [0, 1, 2], 'ext': data[:3]}) df2 = pd.DataFrame({'int2': [1, 2, 3, 4], 'key': [0, 0, 1, 3]}) From 884510c15a11d69af4ce8d2113377b3de1eabbdd Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 20 Apr 2018 17:24:24 +0200 Subject: [PATCH 4/5] copy: check for arrays --- pandas/core/internals.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index c7ffd5e7c2fe1..bcb758e30f9ed 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -5541,8 +5541,14 @@ def concatenate_join_units(join_units, concat_axis, copy): if len(to_concat) == 1: # Only one block, nothing to concatenate. concat_values = to_concat[0] - if copy and getattr(concat_values, 'base', 1) is not None: - concat_values = concat_values.copy() + if copy: + if isinstance(concat_values, np.ndarray): + # non-reindexed (=not yet copied) arrays are made into a view + # in JoinUnit.get_reindexed_values + if concat_values.base is not None: + concat_values = concat_values.copy() + elif: + concat_values = concat_values.copy() else: concat_values = _concat._concat_compat(to_concat, axis=concat_axis) From 9cf8cfeb5b1de11cacd5ce766373d5e337f73b46 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 20 Apr 2018 17:28:01 +0200 Subject: [PATCH 5/5] change order in tests for python < 3.6 --- pandas/core/internals.py | 2 +- pandas/tests/extension/base/reshaping.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index bcb758e30f9ed..e98899b2f5c1a 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -5547,7 +5547,7 @@ def concatenate_join_units(join_units, concat_axis, copy): # in JoinUnit.get_reindexed_values if concat_values.base is not None: concat_values = concat_values.copy() - elif: + else: concat_values = concat_values.copy() else: concat_values = _concat._concat_compat(to_concat, axis=concat_axis) diff --git a/pandas/tests/extension/base/reshaping.py b/pandas/tests/extension/base/reshaping.py index 8776549fe80c3..f50222b82df0f 100644 --- a/pandas/tests/extension/base/reshaping.py +++ b/pandas/tests/extension/base/reshaping.py @@ -98,8 +98,8 @@ def test_set_frame_overwrite_object(self, data): def test_merge(self, data, na_value): # GH-20743 - df1 = pd.DataFrame({'int1': [1, 2, 3], 'key': [0, 1, 2], - 'ext': data[:3]}) + df1 = pd.DataFrame({'ext': data[:3], 'int1': [1, 2, 3], + 'key': [0, 1, 2]}) df2 = pd.DataFrame({'int2': [1, 2, 3, 4], 'key': [0, 0, 1, 3]}) res = pd.merge(df1, df2) @@ -107,7 +107,7 @@ def test_merge(self, data, na_value): {'int1': [1, 1, 2], 'int2': [1, 2, 3], 'key': [0, 0, 1], 'ext': data._constructor_from_sequence( [data[0], data[0], data[1]])}) - self.assert_frame_equal(res, exp[['int1', 'key', 'ext', 'int2']]) + self.assert_frame_equal(res, exp[['ext', 'int1', 'key', 'int2']]) res = pd.merge(df1, df2, how='outer') exp = pd.DataFrame( @@ -115,4 +115,4 @@ def test_merge(self, data, na_value): 'key': [0, 0, 1, 2, 3], 'ext': data._constructor_from_sequence( [data[0], data[0], data[1], data[2], na_value])}) - self.assert_frame_equal(res, exp[['int1', 'key', 'ext', 'int2']]) + self.assert_frame_equal(res, exp[['ext', 'int1', 'key', 'int2']])