diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 27b222b8024ce..0e58a5d1b3591 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -6420,31 +6420,15 @@ def _append_list_of_frames(self, other, *args, **kwargs):
         _obj_type = kwargs['_obj_type']
         _item_type = kwargs.get('_item_type')
 
-        from pandas.core.indexes.api import (
-            CannotSortError,
-            _normalize_dataframes,
-        )
+        from pandas.core.indexes.api import _normalize_dataframes
         from pandas.core.reshape.concat import concat
 
-        # The default value of sort in version 0.23.0 is None.
-        # The behavior when this was the value is very
-        # varied and changes according to input type, columns index
-        # type, whether a reindex is necessary or not, etc.
-        #
-        # The code below is a try to reproduce the old behavior,
-        # but note that this is deprecated.
-        #
-        # TODO: handle sort=None here
-
-        # The behavior of concat is a bit problematic as it is. To get around
-        # this, we prepare the DataFrames before feeding them into concat.
+        # TODO: decide how to handle sort=None here
+
+        # The behavior of concat is a bit problematic as it is. To work around
+        # this, we prepare the DataFrames before feeding them into concat.
         to_concat = [self] + other
-        try:
-            to_concat_norm = _normalize_dataframes(to_concat, sort=sort)
-        except CannotSortError:
-            raise TypeError("The resulting columns could not be sorted."
-                            " You can try setting sort=False or use"
-                            " compatible index types.")
+        to_concat_norm = _normalize_dataframes(to_concat, sort=sort)
         result = concat(to_concat_norm, ignore_index=ignore_index,
                         verify_integrity=verify_integrity, sort=sort)
 
@@ -6454,45 +6438,13 @@ def _append_list_of_frames(self, other, *args, **kwargs):
         if not ignore_index:
             result.index.name = self.index.name
 
-        # the conditionals below will be refactored or removed
-
-        if sort is None:
-            # The sorting behaviour for None was weird.
-            # It is getting deprecated.
-            #
-            # By now, fix tests by only sorting when the
-            # original 'other' was a series or a dict.
-            if _obj_type in (dict, Series):
-                sort = False
-            elif _item_type in (dict, Series):
-                # A list of dicts/Series had a different behaviour
-                # when sorting is None.
-                #
-                # We do not sort if the 'other' columns are all
-                # contained in self.columns. Otherwise we do
-                # sort.
-                #
-                # TODO: as per documentation, this seems like the original
-                # behaviour intended for append. Should I implement this
-                # for any inputs that come?
-                self_idx = self.columns
-                other_idx = other[0].columns
-                idx_diff = other_idx.difference(self_idx)
-                sort = len(idx_diff) > 0
-            else:
-                sort = True
-
+        # Reindexing the columns created an artificial float64 where it
+        # was not needed. We can convert the columns back to the expected
+        # type.
         if result.shape[0] == 1:
-            from pandas.core.dtypes.cast import find_common_type
-
-            # Reindexing the columns created an artificial float64 where it
-            # was not needed. We can convert the columns back to the expected
-            # type.
-
-            for col in result:
-                types = [df[col].dtype for df in to_concat if col in df]
-                common_type = find_common_type(types)
-                result[col] = result[col].astype(common_type)
+            base_frame = next(df for df in to_concat_norm if df.shape[0] == 1)
+            dtypes = base_frame.dtypes.to_dict()
+            result = result.astype(dtypes)  # won't work well with dup columns
 
         return result
 
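The single-row branch above replaces the per-column `find_common_type` loop with one `astype` call driven by the dtypes of the one-row input frame. A minimal, self-contained sketch of that idea follows; the names `base`, `wanted` and `result` are illustrative only, and, like the patched code itself (see the inline comment), it does not handle duplicate column labels.

```python
import pandas as pd

# One-row frame whose dtypes the appended result should keep.
base = pd.DataFrame([[1, 2, 3]], columns=['A', 'B', 'C'])
wanted = base.dtypes.to_dict()      # {'A': int64, 'B': int64, 'C': int64}

# Stand-in for a concat/reindex result that was upcast to float64.
result = base.astype('float64')

# Cast the columns back to the dtypes of the single-row source frame,
# mirroring the base_frame / astype(dtypes) step in the hunk above.
result = result.astype(wanted)
assert (result.dtypes == base.dtypes).all()
```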
diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py
index f70dee43c112d..f4d67ec6649a4 100644
--- a/pandas/core/indexes/api.py
+++ b/pandas/core/indexes/api.py
@@ -12,8 +12,7 @@
     _new_Index,
     ensure_index,
     ensure_index_from_sequences,
-    CannotSortError,
-    InvalidIndexError
+    InvalidIndexError,
 )
 from pandas.core.indexes.category import CategoricalIndex  # noqa
 from pandas.core.indexes.multi import MultiIndex  # noqa
@@ -38,6 +37,18 @@
 """)
 
 
+class _CannotSortError(Exception):
+    pass
+
+
+class _CannotSortDuplicatesError(Exception):
+    pass
+
+
+class _DuplicatesError(Exception):
+    pass
+
+
 # TODO: there are many places that rely on these private methods existing in
 # pandas.core.index
 __all__ = ['Index', 'MultiIndex', 'NumericIndex', 'Float64Index', 'Int64Index',
@@ -181,20 +192,40 @@ def _normalize_dataframes(frame_list, verify_inputs=True, sort=False):
     ----------
     index_list: list of Index objects
     verify_inputs: boolean, default True
-        Verify if the input indexes contain overlapping values.
+        Check whether the input indexes contain duplicate values. Ignored
+        when all input indexes share the same identity (a is b).
     sort: boolean, default False
-        Order result index. If False, values will come in the order they
+        Order resulting index. If False, values will come in the order they
         appear.
 
     Raises
     ------
-    CannotSortError
-        When sort=True and the result index is not sortable.
-    InvalidIndexError
-        When verify_inputs=True and 1+ of the indexes contain duplicates.
+    InvalidIndexError
+        When at least one of the column indexes contains duplicate values
+        and duplicates are not allowed.
+    TypeError
+        When sort=True and the resulting column index could not be sorted.
     """
     orig_columns = [df.columns for df in frame_list]
-    merged_columns = _merge_index_list(orig_columns, verify_inputs, sort)
+
+    kwargs = {
+        'verify_dups': verify_inputs,
+        'allow_matching_dups': verify_inputs,
+        'sort': sort,
+    }
+
+    try:
+        merged_columns = _merge_index_list(orig_columns, **kwargs)
+    except _DuplicatesError:
+        raise InvalidIndexError("Indexes with duplicates are only allowed"
+                                " when they are the same (a is b).")
+    except _CannotSortDuplicatesError:
+        raise InvalidIndexError("When sort=True, indexes with duplicate"
+                                " values are not allowed.")
+    except _CannotSortError:
+        raise TypeError("The resulting columns could not be sorted."
+                        " You can try setting sort=False or use"
+                        " compatible index types.")
 
     # Because _merge_index_list may infer the index dtype based on values,
     # we have to provide a workaround to conserve the original dtype.
@@ -217,33 +248,64 @@ def _normalize_dataframes(frame_list, verify_inputs=True, sort=False):
     return [_reindex(df, merged_columns, axis=1) for df in frame_list]
 
 
-def _merge_index_list(index_list, verify_inputs=True, sort=False):
+def _merge_index_list(index_list,
+                      verify_dups=True,
+                      allow_matching_dups=False,
+                      sort=False):
     """Merge a list of indexes into one big index
 
     Parameters
     ----------
     index_list: list of Index objects
-    verify_inputs: boolean, default True
-        Verify if the input indexes contain overlapping values.
+    verify_dups: boolean, default True
+        Check whether the input indexes contain duplicate values.
+    allow_matching_dups: boolean, default False
+        Only relevant when verify_dups=True. Allow duplicate values when all
+        indexes have the same identity.
     sort: boolean, default False
         Order result index. If False, values will come in the order they
         appear.
 
     Raises
    ------
-    CannotSortError
+    _CannotSortError
         When sort=True and the result index is not sortable.
-    InvalidIndexError
-        When verify_inputs=True and 1+ of the indexes contain duplicates.
+    _CannotSortDuplicatesError
+        When sort=True and at least one of the inputs contains duplicate
+        values.
+    _DuplicatesError
+        When verify_dups=True and at least one of the input indexes contains
+        duplicate values. This error is not raised if
+        allow_matching_dups=True and all the indexes have a common identity.
     """
-    if verify_inputs:
-        if any([ix.has_duplicates for ix in index_list]):
-            raise InvalidIndexError("Input index has duplicate values")
-
-    result = index_list[0]
-    for idx in index_list[1:]:
+    # keep only identity-distinct indexes (a is b)
+    uindex_list = com.get_distinct_objs(index_list)
+
+    # check for duplicate values
+    if sort or verify_dups:
+        has_dups = any(ix.has_duplicates for ix in uindex_list)
+        if has_dups:
+            if sort:
+                raise _CannotSortDuplicatesError("Cannot sort an index that"
+                                                 " contains duplicate values.")
+            elif verify_dups and not allow_matching_dups:
+                raise _DuplicatesError("Index has duplicate values.")
+            elif verify_dups and allow_matching_dups and len(uindex_list) >= 2:
+                raise _DuplicatesError("Index has duplicate values and does"
+                                       " not match other indexes.")
+
+    # edge cases
+    if len(uindex_list) == 0:
+        return pd.Index([])
+    elif len(uindex_list) == 1:
+        return uindex_list[0]
+
+    # reduce to one result
+    result = uindex_list[0]
+    for idx in uindex_list[1:]:
         result = _merge_indexes(result, idx)
 
+    # sort
     return result if not sort else _sort_index(result)
 
 
@@ -278,7 +340,7 @@ def _sort_index(index):
     try:
         return index.sort_values()
     except TypeError:
-        raise CannotSortError
+        raise _CannotSortError
 
 
 def _reindex(df, new_index, axis=0):
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index 40ce8faeba6bb..b2b6e02e908c5 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -158,10 +158,6 @@ def index_arithmetic_method(self, other):
     return set_function_name(index_arithmetic_method, name, cls)
 
 
-class CannotSortError(Exception):
-    pass
-
-
 class InvalidIndexError(Exception):
     pass
 
diff --git a/pandas/tests/reshape/test_append.py b/pandas/tests/reshape/test_append.py
index d848b06857a2e..bb9c864d81353 100644
--- a/pandas/tests/reshape/test_append.py
+++ b/pandas/tests/reshape/test_append.py
@@ -13,6 +13,7 @@
 
     # base
     pd.Index(['A', 'B', 'C']),
+    pd.Index(['A', 'B', 'C'], name='foo'),
 
     # numeric
    pd.RangeIndex(3),
@@ -38,6 +39,36 @@
 ]
 
 
+indexes_with_dups = [
+    # base
+    pd.Index(['A', 'B', 'B']),
+    pd.Index(['B', 'B', 'A']),
+    pd.Index(['A', 'B', 'B'], name='foo'),
+    pd.Index(['B', 'B', 'A'], name='bar'),
+
+    # numeric
+    pd.Index([9, 10, 10], dtype=object),
+    pd.Int64Index([3, 4, 4]),
+    pd.UInt64Index([6, 7, 7]),
+    pd.Float64Index([3.5, 4.5, 4.5]),
+
+    # datetime
+    pd.to_datetime(['2013-01-01', '2013-01-10', '2013-01-10']),
+    pd.to_timedelta(['1 day', '2 days', '2 days']),
+    pd.PeriodIndex([2000, 2001, 2001], freq='A'),
+
+    # interval
+    pd.IntervalIndex.from_arrays([0, 1, 1], [1, 2, 2]),
+
+    # categorical
+    pd.CategoricalIndex('A B B'.split()),
+    pd.CategoricalIndex('D E E'.split(), ordered=True),
+
+    # multi-index
+    pd.MultiIndex.from_arrays(['A B B'.split(), 'D E E'.split()]),
+]
+
+
 index_sort_groups = [
     # When indexes from the same group are joined, the result is sortable.
     # When indexes from different groups are joined, the result is not
@@ -246,7 +277,7 @@ def test_bad_input_type(self, sort):
 
     def test_no_unecessary_upcast(self, sort):
         # GH: 22621
-        # When appending, the resulting columns should
+        # When appending, the result columns should
         # not be float64 without necessity.
 
         # basic
@@ -270,20 +301,16 @@ def test_no_unecessary_upcast(self, sort):
         assert_frame_equal(result, expected)
 
         # 0 rows 2 columns
-        # (the original dtype (object) of the empty columns
-        # must be preserved)
         df1 = pd.DataFrame([[1, 2, 3]], columns=[0, 1, 2])
         df2 = pd.DataFrame(columns=[3, 4])
         result = df1.append(df2, sort=sort)
         expected = pd.DataFrame([[1, 2, 3, np.nan, np.nan]])
-        expected[[3, 4]] = expected[[3, 4]].astype(object)
         assert_frame_equal(result, expected)
 
         df1 = pd.DataFrame(columns=[0, 1])
         df2 = pd.DataFrame([[1, 2, 3]], columns=[2, 3, 4])
         result = df1.append(df2, sort=sort)
         expected = pd.DataFrame([[np.nan, np.nan, 1, 2, 3]])
-        expected[[0, 1]] = expected[[0, 1]].astype(object)
         assert_frame_equal(result, expected)
 
         # big.append(small)
@@ -300,71 +327,6 @@ def test_no_unecessary_upcast(self, sort):
         expected = pd.DataFrame([[1, 2, np.nan], [3, 4, 5]])
         assert_frame_equal(result, expected)
 
-    def test_preserve_empty_columns_dtype(self, sort):
-        # When appending to an empty DataFrame with columns, the dtype of these
-        # columns should be accounted for the output.
-
-        # append same size (default dtype)
-        df1 = pd.DataFrame(columns=list('ABC'))  # object
-        df2 = pd.DataFrame([[1, 2, 3]], columns=list('ABC'))
-
-        result1 = df1.append(df2, sort=sort)
-        result2 = df2.append(df1, sort=sort)
-
-        expected = df2.astype(object)
-        assert_frame_equal(result1, expected)
-        assert_frame_equal(result2, expected)
-
-        # GH: 22858 - df1 ends up float64
-        # append same size (int64)
-        # df1 = pd.DataFrame(columns=list('ABC'), dtype='int64')
-        # df2 = pd.DataFrame([[1, 2, 3]], columns=list('ABC'))
-
-        # result1 = df1.append(df2, sort=sort)
-        # result2 = df2.append(df1, sort=sort)
-
-        # expected = df2.astype('int64')  # same as df2
-        # assert_frame_equal(result1, expected)
-        # assert_frame_equal(result2, expected)
-
-        # append same size (float64)
-        df1 = pd.DataFrame(columns=list('ABC'), dtype='float64')
-        df2 = pd.DataFrame([[1, 2, 3]], columns=list('ABC'))
-
-        result1 = df1.append(df2, sort=sort)
-        result2 = df2.append(df1, sort=sort)
-
-        expected = df2.astype('float64')
-        assert_frame_equal(result1, expected)
-        assert_frame_equal(result2, expected)
-
-        # append small/big - small empty
-        small = pd.DataFrame(columns=list('AB'))
-        big = pd.DataFrame([[1, 2, 3]], columns=list('ABC'))
-
-        result1 = small.append(big, sort=sort)
-        result2 = big.append(small, sort=sort)
-
-        expected = big.copy()
-        expected[['A', 'B']] = expected[['A', 'B']].astype(object)
-        assert_frame_equal(result1, expected)
-        assert_frame_equal(result2, expected)
-
-        # append small/big - big empty
-        small = pd.DataFrame([[1, 2]], columns=list('AB'))
-        big = pd.DataFrame(columns=list('ABC'))
-
-        result1 = small.append(big, sort=sort)
-        result2 = big.append(small, sort=sort)
-
-        expected = pd.DataFrame(
-            [[1, 2, np.nan]],
-            columns=list('ABC'),
-            dtype=object
-        )
-        assert_frame_equal(result1, expected)
-        assert_frame_equal(result2, expected)
-
 
 class TestAppendColumnsIndex(object):
     @pytest.mark.parametrize('idx_name3', [None, 'foo', 'bar', 'baz'])
@@ -471,39 +433,90 @@ def test_preserve_index_values_with_sort(self, index1, index2):
         for value in index2:
             assert value in result.columns
 
-    def test_raise_on_duplicates(self, sort):
-        # Append should not allow DataFrames with repeated
-        # column names (or series with repeated row names).
-
-        # dupe on base
-        df1 = pd.DataFrame([[1, 2, 3]], columns=['A', 'B', 'B'])
-        df2 = pd.DataFrame([[1, 2, 3]], columns=['A', 'B', 'C'])
-        with pytest.raises(InvalidIndexError):
-            df1.append([], sort=sort)
-        with pytest.raises(InvalidIndexError):
-            df1.append([df2], sort=sort)
-        with pytest.raises(InvalidIndexError):
-            df1.append([df2, df2], sort=sort)
-
-        # dupe on other
-        df1 = pd.DataFrame([[1, 2, 3]], columns=['A', 'B', 'C'])
-        df2 = pd.DataFrame([[1, 2, 3]], columns=['A', 'B', 'B'])
-        with pytest.raises(InvalidIndexError):
-            df1.append([df2], sort=sort)
-        with pytest.raises(InvalidIndexError):
-            df1.append([df2, df2], sort=sort)
-
-        # dupe on both
-        # (we could avoid raising errors here, but, to keep the api
-        # consistent, we don't)
-        df1 = pd.DataFrame([[1, 2, 3]], columns=['A', 'B', 'B'])
-        df2 = pd.DataFrame([[1, 2, 3]], columns=['A', 'B', 'B'])
-        with pytest.raises(InvalidIndexError):
-            df1.append([], sort=sort)
-        with pytest.raises(InvalidIndexError):
-            df1.append([df2], sort=sort)
-        with pytest.raises(InvalidIndexError):
-            df1.append([df2, df2], sort=sort)
+    @pytest.mark.parametrize('col_index', indexes_with_dups, ids=cls_name)
+    def test_good_duplicates_without_sort(self, col_index):
+        # When all indexes have the same identity (a is b), duplicates should
+        # be allowed and append works.
+
+        df1 = pd.DataFrame([[1, 2, 3]], columns=col_index)
+        df2 = pd.DataFrame([[4, 5, 6]], columns=col_index)
+
+        # df1.append([])
+        result = df1.append([], sort=False)
+        expected = df1.copy()
+        assert_frame_equal(result, expected)
+
+        # df1.append([df2])
+        result = df1.append([df2], ignore_index=True, sort=False)
+        expected = pd.DataFrame([[1, 2, 3], [4, 5, 6]])
+        expected.columns = col_index
+        assert_frame_equal(result, expected)
+
+        # df1.append([df2, df2])
+        result = df1.append([df2, df2], ignore_index=True, sort=False)
+        expected = pd.DataFrame([[1, 2, 3], [4, 5, 6], [4, 5, 6]])
+        expected.columns = col_index
+        assert_frame_equal(result, expected)
+
+        # df2.append([])
+        result = df2.append([], sort=False)
+        expected = df2.copy()
+        assert_frame_equal(result, expected)
+
+        # df2.append([df1])
+        result = df2.append([df1], ignore_index=True, sort=False)
+        expected = pd.DataFrame([[4, 5, 6], [1, 2, 3]])
+        expected.columns = col_index
+        assert_frame_equal(result, expected)
+
+        # df2.append([df1, df1])
+        result = df2.append([df1, df1], ignore_index=True, sort=False)
+        expected = pd.DataFrame([[4, 5, 6], [1, 2, 3], [1, 2, 3]])
+        expected.columns = col_index
+        assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize('col_index', indexes_with_dups, ids=cls_name)
+    def test_bad_duplicates_without_sort(self, col_index):
+        # When the indexes do not share a common identity, duplicates are not
+        # allowed and append raises.
+
+        df1 = pd.DataFrame([[1, 2, 3]], columns=col_index)
+        df2 = pd.DataFrame([[4, 5, 6]], columns=col_index)
+        df3 = pd.DataFrame([[7, 8, 9]], columns=col_index.copy())  # different
+        ctx = pytest.raises(InvalidIndexError,
+                            match=r'Indexes with duplicates.*a is b.*')
+        with ctx:
+            result = df1.append([df3], sort=False)
+        with ctx:
+            result = df1.append([df2, df3], sort=False)
+        with ctx:
+            result = df1.append([df3, df2], sort=False)
+        with ctx:
+            result = df1.append([df3, df3], sort=False)
+
+    @pytest.mark.parametrize('col_index', indexes_with_dups, ids=cls_name)
+    def test_duplicates_with_sort(self, col_index):
+        # When sort=True, indexes with duplicate values are not allowed.
+
+        df1 = pd.DataFrame([[1, 2, 3]], columns=col_index)
+        df2 = pd.DataFrame([[4, 5, 6]], columns=col_index.copy())
+        ctx = pytest.raises(InvalidIndexError,
+                            match=r'When sort=True, indexes with dupl.*')
+
+        with ctx:
+            result = df1.append([], sort=True)
+        with ctx:
+            result = df1.append([df1], sort=True)
+        with ctx:
+            result = df1.append([df2], sort=True)
+        with ctx:
+            result = df1.append([df1, df1], sort=True)
+        with ctx:
+            result = df1.append([df1, df2], sort=True)
+        with ctx:
+            result = df1.append([df2, df1], sort=True)
+        with ctx:
+            result = df1.append([df2, df2], sort=True)
 
     def test_nosort_basic(self):
         # When sort=False, the resulting columns come
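Taken together, the tests above pin down the user-facing behaviour for duplicate column labels. The sketch below shows what they assert, assuming this patch is applied; `cols`, `df1`, `df2` and `df3` are illustrative names and the error messages in the comments are abbreviated from the `match=` patterns.

```python
import pandas as pd
from pandas.core.indexes.base import InvalidIndexError

cols = pd.Index(['A', 'B', 'B'])                       # duplicate labels
df1 = pd.DataFrame([[1, 2, 3]], columns=cols)
df2 = pd.DataFrame([[4, 5, 6]], columns=cols)          # same object (a is b)
df3 = pd.DataFrame([[7, 8, 9]], columns=cols.copy())   # equal, not identical

# Shared identity: duplicates are accepted and append works.
df1.append([df2], ignore_index=True, sort=False)

# Different identity: duplicates are rejected.
try:
    df1.append([df3], sort=False)
except InvalidIndexError:
    pass  # "Indexes with duplicates are only allowed when they are the same"

# sort=True never accepts duplicate column labels, even with shared identity.
try:
    df1.append([df1], sort=True)
except InvalidIndexError:
    pass  # "When sort=True, indexes with duplicate values are not allowed."
```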