From 96d893322a4548d6d37132eb343d223f88aa77f5 Mon Sep 17 00:00:00 2001 From: araraonline Date: Fri, 14 Sep 2018 11:44:34 -0300 Subject: [PATCH 01/30] Add test cases --- pandas/tests/reshape/test_concat.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index 762b04cc3bd4f..fbddf6c4f7c6b 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -1009,6 +1009,33 @@ def test_append_missing_column_proper_upcast(self, sort): assert appended['A'].dtype == 'f8' assert appended['B'].dtype == 'O' + def test_append_series_no_unecessary_upcast(self): + # case 1 + # append to DataFrame with no rows + # columns from the Series should preserve the Series dtype + + df = pd.DataFrame() + series = pd.Series([1, 2, 3], name=0) + result = df.append(series) + expected = pd.DataFrame([[1, 2, 3]]) + assert_frame_equal(result, expected) + + df = pd.DataFrame(columns=[0]) + series = pd.Series([1, 2, 3], index=[1, 2, 3], name=0) + result = df.append(series) + expected = pd.DataFrame([[np.nan, 1, 2, 3]], columns=[0, 1, 2, 3]) + assert_frame_equal(result, expected) + + # case 2 + # append to DataFrame bigger than Series + # columns that come from both should preserve the dtype + + df = pd.DataFrame([[1, 2, 3, 4]]) + series = pd.Series([1, 2, 3], name=1) + result = df.append(series) + expected = pd.DataFrame([[1, 2, 3, 4.], [1, 2, 3, np.nan]]) + assert_frame_equal(result, expected) + class TestConcatenate(ConcatenateBase): From fbd41eb6ad0baf788f34dda23b97c917703cd85d Mon Sep 17 00:00:00 2001 From: araraonline Date: Fri, 14 Sep 2018 20:21:01 -0300 Subject: [PATCH 02/30] Rewrite `DataFrame.append` --- pandas/core/frame.py | 84 ++++++++++++++++++++++++++------------------ 1 file changed, 49 insertions(+), 35 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 251bc6587872d..c656f5412dfc5 100644 --- a/pandas/core/frame.py +++ 
b/pandas/core/frame.py @@ -6338,45 +6338,59 @@ def append(self, other, ignore_index=False, 3 3 4 4 """ - if isinstance(other, (Series, dict)): - if isinstance(other, dict): - other = Series(other) - if other.name is None and not ignore_index: - raise TypeError('Can only append a Series if ignore_index=True' - ' or if the Series has a name') - - if other.name is None: - index = None + kwargs = { + 'ignore_index': ignore_index, + 'verify_integrity': verify_integrity, + 'sort': sort, + } + + obj_type = type(other) + if issubclass(obj_type, dict): + return self._append_dict(other, **kwargs) + elif issubclass(obj_type, Series): + return self._append_series(other, **kwargs) + elif issubclass(obj_type, DataFrame): + return self._append_frame(other, **kwargs) + elif issubclass(obj_type, list): + try: + item_type = type(other[0]) + except IndexError: # empty list! + return self._append_list_of_frames(other, **kwargs) + + if issubclass(item_type, dict): + return self._append_list_of_dicts(other, **kwargs) + elif issubclass(item_type, Series): + return self._append_list_of_series(other, **kwargs) + elif issubclass(item_type, DataFrame): + return self._append_list_of_frames(other, **kwargs) else: - # other must have the same index name as self, otherwise - # index name will be reset - index = Index([other.name], name=self.index.name) + raise TypeError # TODO + else: + raise TypeError # TODO - idx_diff = other.index.difference(self.columns) - try: - combined_columns = self.columns.append(idx_diff) - except TypeError: - combined_columns = self.columns.astype(object).append(idx_diff) - other = other.reindex(combined_columns, copy=False) - other = DataFrame(other.values.reshape((1, len(other))), - index=index, - columns=combined_columns) - other = other._convert(datetime=True, timedelta=True) - if not self.columns.equals(combined_columns): - self = self.reindex(columns=combined_columns) - elif isinstance(other, list) and not isinstance(other[0], DataFrame): - other = 
DataFrame(other) - if (self.columns.get_indexer(other.columns) >= 0).all(): - other = other.loc[:, self.columns] + def _append_dict(self, other, *args, **kwargs): + return self._append_list_of_dicts(Series(other), *args, **kwargs) + + def _append_series(self, other, *args, **kwargs): + return self._append_list_of_series([other], *args, **kwargs) + + def _append_frame(self, other, *args, **kwargs): + return self._append_list_of_frames([other], *args, **kwargs) + def _append_list_of_dicts(self, other, *args, **kwargs): + return self._append_frame(DataFrame(other), *args, **kwargs) + + def _append_list_of_series(self, other, *args, **kwargs): + return self._append_frame(DataFrame(other), *args, **kwargs) + + def _append_list_of_frames(self, other, *args, **kwargs): from pandas.core.reshape.concat import concat - if isinstance(other, (list, tuple)): - to_concat = [self] + other - else: - to_concat = [self, other] - return concat(to_concat, ignore_index=ignore_index, - verify_integrity=verify_integrity, - sort=sort) + to_concat = [self] + other + return concat(to_concat, **kwargs) + # return concat(to_concat, ignore_index=ignore_index, + # verify_integrity=verify_integrity, + # sort=sort) + # TODO: ignore bad arguments def join(self, other, on=None, how='left', lsuffix='', rsuffix='', sort=False): From 5db58d8e02736121e40b9b6901472671a2443118 Mon Sep 17 00:00:00 2001 From: araraonline Date: Fri, 14 Sep 2018 11:57:58 -0300 Subject: [PATCH 03/30] Fix test case The test fixed another thing, but got broken by this PR due to its expected result being based indirectly in the bad upcasting of dtypes. 
--- pandas/tests/reshape/test_concat.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index fbddf6c4f7c6b..36181fe6b3d12 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -904,7 +904,7 @@ def test_append_same_columns_type(self, index): ser_index = index[:2] ser = pd.Series([7, 8], index=ser_index, name=2) result = df.append(ser) - expected = pd.DataFrame([[1., 2., 3.], [4, 5, 6], [7, 8, np.nan]], + expected = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, np.nan]], index=[0, 1, 2], columns=index) assert_frame_equal(result, expected) @@ -1009,21 +1009,22 @@ def test_append_missing_column_proper_upcast(self, sort): assert appended['A'].dtype == 'f8' assert appended['B'].dtype == 'O' - def test_append_series_no_unecessary_upcast(self): + def test_append_series_no_unecessary_upcast(self, sort): # case 1 # append to DataFrame with no rows # columns from the Series should preserve the Series dtype df = pd.DataFrame() series = pd.Series([1, 2, 3], name=0) - result = df.append(series) + result = df.append(series, sort=sort) expected = pd.DataFrame([[1, 2, 3]]) assert_frame_equal(result, expected) df = pd.DataFrame(columns=[0]) series = pd.Series([1, 2, 3], index=[1, 2, 3], name=0) - result = df.append(series) + result = df.append(series, sort=sort) expected = pd.DataFrame([[np.nan, 1, 2, 3]], columns=[0, 1, 2, 3]) + expected[0] = expected[0].astype(object) # original dtype of df assert_frame_equal(result, expected) # case 2 @@ -1032,7 +1033,7 @@ def test_append_series_no_unecessary_upcast(self): df = pd.DataFrame([[1, 2, 3, 4]]) series = pd.Series([1, 2, 3], name=1) - result = df.append(series) + result = df.append(series, sort=sort) expected = pd.DataFrame([[1, 2, 3, 4.], [1, 2, 3, np.nan]]) assert_frame_equal(result, expected) From 788c6855fd70dbb8f041bf3344468d5a3415bb7e Mon Sep 17 00:00:00 2001 From: araraonline Date: Sat, 15 
Sep 2018 01:15:22 -0300 Subject: [PATCH 04/30] Rewrite the sorting behaviour The sorting behaviour of append was pretty messy (it still is, but to a lesser extent). When passed a dict or series, the append function would reindex both self and other, and then send these to the concat function. When passed a list of series or dicts, the list would be transformed to a dframe and, if its columns were contained in self, we would reshape it to match those of self. Other types would be sent directly to concat though. The concat function, by itself, has a pretty consistent sorting for when sort=True, but doesn't seem so consistent for when sort=False or sort=None (I haven't dug much deeper, but pandas/core/indexes/api.py:_union_indexes sorts some arrays even if the sort argument is False). So, to solve this mess, I went for the simplest approach. I'm leaving concat untouched and focusing only on append. Do the concatenating as usual and, if the sort parameter was False, we can reindex the resulting dframe to match what we would expect from an unsorted axis. Also, I needed to do some juggling because of the sort=None that is being deprecated. When 'other' is a dict or series and sort=None, we can set sort=False, and it will work like before. If 'other' is something else, we just avoid doing the reindex in the end. I haven't checked all cases, so I'm not sure if this is exactly the same behaviour as before. But hopefully, we can provide more tests to pinpoint the sorting behaviour of append better, and this will be enough for now. 
--- pandas/core/frame.py | 39 ++++++++++++++++++++++++++++++++++----- 1 file changed, 34 insertions(+), 5 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c656f5412dfc5..f55f7db181494 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6369,9 +6369,11 @@ def append(self, other, ignore_index=False, raise TypeError # TODO def _append_dict(self, other, *args, **kwargs): + kwargs['nosort_flag'] = True return self._append_list_of_dicts(Series(other), *args, **kwargs) def _append_series(self, other, *args, **kwargs): + kwargs['nosort_flag'] = True return self._append_list_of_series([other], *args, **kwargs) def _append_frame(self, other, *args, **kwargs): @@ -6384,13 +6386,40 @@ def _append_list_of_series(self, other, *args, **kwargs): return self._append_frame(DataFrame(other), *args, **kwargs) def _append_list_of_frames(self, other, *args, **kwargs): + ignore_index = kwargs['ignore_index'] + verify_integrity = kwargs['verify_integrity'] + sort = kwargs['sort'] + nosort_flag = kwargs.get('nosort_flag', False) # do not sort if sort is None + + import pandas.core.common as com from pandas.core.reshape.concat import concat + to_concat = [self] + other - return concat(to_concat, **kwargs) - # return concat(to_concat, ignore_index=ignore_index, - # verify_integrity=verify_integrity, - # sort=sort) - # TODO: ignore bad arguments + result = concat(to_concat, ignore_index=ignore_index, + verify_integrity=verify_integrity, sort=sort) + + if sort is None: + # The sorting behaviour for None was weird. + # It is getting deprecated. + # + # By now, fix tests by only sorting when the + # original 'other' was a series or a dict. + sort = not nosort_flag + + if not sort: + # Concat sorts the column indexes if they are 'special'. + # We don't want this behaviour if sort is False. 
+ result_idx = self.columns + for frame in other: + column_idx = frame.columns + idx_diff = column_idx.difference(result_idx) + try: + result_idx = result_idx.append(idx_diff) + except TypeError: + result_idx = result_idx.astype(object).append(idx_diff) + result = result.reindex(columns=result_idx, copy=False) + + return result def join(self, other, on=None, how='left', lsuffix='', rsuffix='', sort=False): From b3059aaa82ad4a434a85c4ff2fdf15a3e64167cb Mon Sep 17 00:00:00 2001 From: araraonline Date: Sat, 15 Sep 2018 18:48:22 -0300 Subject: [PATCH 05/30] Let dtypes keep the same if result has one row --- pandas/core/frame.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index f55f7db181494..5090b50eb62f5 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6419,6 +6419,20 @@ def _append_list_of_frames(self, other, *args, **kwargs): result_idx = result_idx.astype(object).append(idx_diff) result = result.reindex(columns=result_idx, copy=False) + if result.shape[0] == 1: + # If we got only one row of result, this means that + # the resulting dtypes can match the dframe where + # they come from. + # + # Concat achieves this behaviour when concatenating + # an empty DataFrame, but not if it has some columns. + # + # This is a hack for retrieving the dtypes back. 
+ base_frame = [frame for frame in to_concat if frame.shape[0] == 1][0] + base_columns = base_frame.columns + base_dtypes = base_frame.dtypes + result[base_columns] = result[base_columns].astype(base_dtypes) + return result def join(self, other, on=None, how='left', lsuffix='', rsuffix='', From 1fe121f5d143d7e64bd84e753c524371731f7dbe Mon Sep 17 00:00:00 2001 From: araraonline Date: Sat, 15 Sep 2018 19:40:07 -0300 Subject: [PATCH 06/30] Add check for ignore_index --- pandas/core/frame.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 5090b50eb62f5..e483f2cb370ad 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6380,9 +6380,15 @@ def _append_frame(self, other, *args, **kwargs): return self._append_list_of_frames([other], *args, **kwargs) def _append_list_of_dicts(self, other, *args, **kwargs): + if not kwargs['ignore_index']: + raise TypeError('Can only append a dict if ignore_index=True') return self._append_frame(DataFrame(other), *args, **kwargs) def _append_list_of_series(self, other, *args, **kwargs): + if not kwargs['ignore_index']: + if any(series.name is None for series in other): + raise TypeError('Can only append a Series if ignore_index=True' + 'or if the Series has a name') return self._append_frame(DataFrame(other), *args, **kwargs) def _append_list_of_frames(self, other, *args, **kwargs): From 945e96bf487a39930b7f83b537e6f4273ef0f53a Mon Sep 17 00:00:00 2001 From: araraonline Date: Sat, 15 Sep 2018 21:01:40 -0300 Subject: [PATCH 07/30] Fix _append_dict behaviour --- pandas/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index e483f2cb370ad..0947c41e00014 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6370,7 +6370,7 @@ def append(self, other, ignore_index=False, def _append_dict(self, other, *args, **kwargs): kwargs['nosort_flag'] = True - return self._append_list_of_dicts(Series(other), 
*args, **kwargs) + return self._append_list_of_dicts([other], *args, **kwargs) def _append_series(self, other, *args, **kwargs): kwargs['nosort_flag'] = True From 8c16f53bc5e40270a00e31af5cdeec96b4cd9858 Mon Sep 17 00:00:00 2001 From: araraonline Date: Sat, 15 Sep 2018 21:02:03 -0300 Subject: [PATCH 08/30] More sort mechanisms --- pandas/core/frame.py | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 0947c41e00014..20accb3737b82 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6345,6 +6345,7 @@ def append(self, other, ignore_index=False, } obj_type = type(other) + kwargs['_obj_type'] = obj_type if issubclass(obj_type, dict): return self._append_dict(other, **kwargs) elif issubclass(obj_type, Series): @@ -6356,6 +6357,7 @@ def append(self, other, ignore_index=False, item_type = type(other[0]) except IndexError: # empty list! return self._append_list_of_frames(other, **kwargs) + kwargs['_item_type'] = item_type if issubclass(item_type, dict): return self._append_list_of_dicts(other, **kwargs) @@ -6369,11 +6371,9 @@ def append(self, other, ignore_index=False, raise TypeError # TODO def _append_dict(self, other, *args, **kwargs): - kwargs['nosort_flag'] = True return self._append_list_of_dicts([other], *args, **kwargs) def _append_series(self, other, *args, **kwargs): - kwargs['nosort_flag'] = True return self._append_list_of_series([other], *args, **kwargs) def _append_frame(self, other, *args, **kwargs): @@ -6395,9 +6395,9 @@ def _append_list_of_frames(self, other, *args, **kwargs): ignore_index = kwargs['ignore_index'] verify_integrity = kwargs['verify_integrity'] sort = kwargs['sort'] - nosort_flag = kwargs.get('nosort_flag', False) # do not sort if sort is None + _obj_type = kwargs['_obj_type'] + _item_type = kwargs.get('_item_type') - import pandas.core.common as com from pandas.core.reshape.concat import concat to_concat = [self] + other @@ -6410,7 
+6410,25 @@ def _append_list_of_frames(self, other, *args, **kwargs): # # By now, fix tests by only sorting when the # original 'other' was a series or a dict. - sort = not nosort_flag + if _obj_type in (dict, Series): + sort = False + elif _item_type in (dict, Series): + # A list of dicts/Series had a different behaviour + # when sorting is None. + # + # We do not sort if the 'other' columns are all + # contained in self.columns. Otherwise we do + # sort. + # + # TODO: as per documentation, this seems like the original + # behaviour intended for append. Should I implement this + # for any inputs that come? + self_idx = self.columns + other_idx = other[0].columns + idx_diff = other_idx.difference(self_idx) + sort = len(idx_diff) > 0 + else: + sort = True if not sort: # Concat sorts the column indexes if they are 'special'. From 6fcf54681b93e7479543e12a131fe25a2cadd514 Mon Sep 17 00:00:00 2001 From: araraonline Date: Sun, 16 Sep 2018 00:11:04 -0300 Subject: [PATCH 09/30] Make test catch more exceptions There was a test that checked whether an exception was raised when appending a series whose index could not be joined with the columns of the dframe. This test checked exclusively for TypeError, and this was what the old code produced during a dframe reindex at pandas/core/frame.py:6198. With the new code, however, the reindex is only called at the end, and so the kinds of errors that occur vary. For example, we got AttributeError, ValueError and TypeError. While this is not perfect, the situation didn't change much: it will only break for someone who was catching errors like this with an `except TypeError` clause. Despite this, the message that appeared was cryptic and still is. A better solution would involve informing the user of the reason such an error occurred while still raising TypeError, to keep things backward-compatible. 
--- pandas/tests/reshape/test_concat.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index 36181fe6b3d12..3fea10bf99944 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -958,13 +958,13 @@ def test_append_different_columns_types_raises( df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=index_can_append) ser = pd.Series([7, 8, 9], index=index_cannot_append_with_other, name=2) - with pytest.raises(TypeError): + with pytest.raises((AttributeError, ValueError, TypeError)): df.append(ser) df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=index_cannot_append_with_other) ser = pd.Series([7, 8, 9], index=index_can_append, name=2) - with pytest.raises(TypeError): + with pytest.raises((AttributeError, ValueError, TypeError)): df.append(ser) def test_append_dtype_coerce(self, sort): From 734f2e3df0e4ad4520e302541a3068e3454065f4 Mon Sep 17 00:00:00 2001 From: araraonline Date: Sun, 16 Sep 2018 01:14:02 -0300 Subject: [PATCH 10/30] Performance fix --- pandas/core/frame.py | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 20accb3737b82..c35f27192af0b 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6389,7 +6389,25 @@ def _append_list_of_series(self, other, *args, **kwargs): if any(series.name is None for series in other): raise TypeError('Can only append a Series if ignore_index=True' 'or if the Series has a name') - return self._append_frame(DataFrame(other), *args, **kwargs) + + if len(other) == 1: + # manually create DF for performance + ser = other[0] + + if ser.name is not None: + # other must have the same index name as self, + # otherwise index name will be reset + index = Index([ser.name], name=self.index.name) + else: + index = None + columns = ser.index + + df = DataFrame(ser.values.reshape(1, ser.shape[0]), + index=index, 
columns=columns) + else: + df = DataFrame(other) + + return self._append_frame(df, *args, **kwargs) def _append_list_of_frames(self, other, *args, **kwargs): ignore_index = kwargs['ignore_index'] From 1ad6e82aa4dcc22cdc55b70479daafbadd8290c3 Mon Sep 17 00:00:00 2001 From: araraonline Date: Sun, 16 Sep 2018 21:40:21 -0300 Subject: [PATCH 11/30] Add test case --- pandas/tests/reshape/test_concat.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index 3fea10bf99944..bb7a09bf7debd 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -1037,6 +1037,16 @@ def test_append_series_no_unecessary_upcast(self, sort): expected = pd.DataFrame([[1, 2, 3, 4.], [1, 2, 3, np.nan]]) assert_frame_equal(result, expected) + # case 3 + # append to DataFrame smaller than Series + # columns that come from both should preserve the dtype + + df = pd.DataFrame([[1, 2, 3]]) + series = pd.Series([1, 2, 3, 4], name=1) + result = df.append(series, sort=sort) + expected = pd.DataFrame([[1, 2, 3, np.nan], [1, 2, 3, 4.]]) + assert_frame_equal(result, expected) + class TestConcatenate(ConcatenateBase): From dab07302f1b357d82a51442b34f4ab6cbf479b07 Mon Sep 17 00:00:00 2001 From: araraonline Date: Sun, 16 Sep 2018 21:47:18 -0300 Subject: [PATCH 12/30] missing space --- pandas/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c35f27192af0b..a41428e2d2486 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6388,7 +6388,7 @@ def _append_list_of_series(self, other, *args, **kwargs): if not kwargs['ignore_index']: if any(series.name is None for series in other): raise TypeError('Can only append a Series if ignore_index=True' - 'or if the Series has a name') + ' or if the Series has a name') if len(other) == 1: # manually create DF for performance From 
3314bc51cc9ace94d2c68a9a2e4cd64b01edaa2b Mon Sep 17 00:00:00 2001 From: araraonline Date: Sun, 16 Sep 2018 23:45:29 -0300 Subject: [PATCH 13/30] Write TypeError messages for append --- pandas/core/frame.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index a41428e2d2486..0b9926c47ce6a 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6353,10 +6353,14 @@ def append(self, other, ignore_index=False, elif issubclass(obj_type, DataFrame): return self._append_frame(other, **kwargs) elif issubclass(obj_type, list): + try: item_type = type(other[0]) except IndexError: # empty list! return self._append_list_of_frames(other, **kwargs) + if not all(isinstance(i, item_type) for i in other[1:]): + raise TypeError("When other is a list, its elements must" + " be all of the same type") kwargs['_item_type'] = item_type if issubclass(item_type, dict): @@ -6366,9 +6370,13 @@ def append(self, other, ignore_index=False, elif issubclass(item_type, DataFrame): return self._append_list_of_frames(other, **kwargs) else: - raise TypeError # TODO + raise TypeError("The value of other must be a" + " DataFrame or Series/dict-like object," + " or list of these") else: - raise TypeError # TODO + raise TypeError("The value of other must be a" + " DataFrame or Series/dict-like object," + " or list of these") def _append_dict(self, other, *args, **kwargs): return self._append_list_of_dicts([other], *args, **kwargs) From 96153c3ee4f1a61b68339b4eb6a7d91860f29f9c Mon Sep 17 00:00:00 2001 From: araraonline Date: Mon, 17 Sep 2018 00:02:31 -0300 Subject: [PATCH 14/30] Write TypeError split for mixed lists --- pandas/core/frame.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 0b9926c47ce6a..07c070bb58d72 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6359,8 +6359,13 @@ def append(self, other, ignore_index=False, 
except IndexError: # empty list! return self._append_list_of_frames(other, **kwargs) if not all(isinstance(i, item_type) for i in other[1:]): - raise TypeError("When other is a list, its elements must" - " be all of the same type") + if issubclass(item_type, (dict, Series, DataFrame)): + raise TypeError("When other is a list, its elements must" + " be all of the same type") + else: + raise TypeError("The value of other must be a" + " DataFrame or Series/dict-like object," + " or list of these") kwargs['_item_type'] = item_type if issubclass(item_type, dict): From facafb8fa08c0c2a72e79c25f6bffb4a0ff078d5 Mon Sep 17 00:00:00 2001 From: araraonline Date: Mon, 17 Sep 2018 19:02:57 -0300 Subject: [PATCH 15/30] Preserve name of index in result --- pandas/core/frame.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 07c070bb58d72..f186880fdf482 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6406,17 +6406,8 @@ def _append_list_of_series(self, other, *args, **kwargs): if len(other) == 1: # manually create DF for performance ser = other[0] - - if ser.name is not None: - # other must have the same index name as self, - # otherwise index name will be reset - index = Index([ser.name], name=self.index.name) - else: - index = None - columns = ser.index - df = DataFrame(ser.values.reshape(1, ser.shape[0]), - index=index, columns=columns) + index=[ser.name], columns=ser.index) else: df = DataFrame(other) @@ -6488,6 +6479,11 @@ def _append_list_of_frames(self, other, *args, **kwargs): base_dtypes = base_frame.dtypes result[base_columns] = result[base_columns].astype(base_dtypes) + if not ignore_index: + # We want to keep the index name of the original dframe (self). + # Rename the index after concat erases it. 
+ result.index.name = self.index.name + return result def join(self, other, on=None, how='left', lsuffix='', rsuffix='', From ecc24678098589412fb4f84f8cf12938a2e21901 Mon Sep 17 00:00:00 2001 From: araraonline Date: Wed, 19 Sep 2018 12:51:02 -0300 Subject: [PATCH 16/30] Preserve columns index name --- pandas/core/frame.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index f186880fdf482..36a1aba411d1c 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6484,6 +6484,11 @@ def _append_list_of_frames(self, other, *args, **kwargs): # Rename the index after concat erases it. result.index.name = self.index.name + # keep the same column index name as + # the original dframe (self) + # XXX: will this break anything in MultiIndex? + result.columns.name = self.columns.name + return result def join(self, other, on=None, how='left', lsuffix='', rsuffix='', From 583de41c3c021dba700ff85939263e672bd8f90c Mon Sep 17 00:00:00 2001 From: araraonline Date: Mon, 24 Sep 2018 21:34:41 -0300 Subject: [PATCH 17/30] Create/Isolate new tests Lots of tests have been created. Tests scope: - Input types - Bad coercion to float (difficult to imagine, but I feel it was tested fully) - Rows index - Index name - Dtypes (+coercing) - Columns index - Index name - Dtypes (+ coercing) - Check duplicates - Sorting behavior \* Some tests not yet implemented: - Checks for duplicates in the rows index (linked with verify_integrity) - Ignoring rows index (ignore_index) --> Remember to use a modified assert_index_equal (or assert_frame_equal) - Behavior related to ignore_index (e.g. 
raising when unnamed Series is passed as arg) \*: The sorting tests forget about sort=None completely --- pandas/tests/reshape/test_append.py | 758 ++++++++++++++++++++++++++++ pandas/tests/reshape/test_concat.py | 38 -- 2 files changed, 758 insertions(+), 38 deletions(-) create mode 100644 pandas/tests/reshape/test_append.py diff --git a/pandas/tests/reshape/test_append.py b/pandas/tests/reshape/test_append.py new file mode 100644 index 0000000000000..d096f01a6dda9 --- /dev/null +++ b/pandas/tests/reshape/test_append.py @@ -0,0 +1,758 @@ +from itertools import product + +import numpy as np +import pytest + +import pandas as pd +from pandas.core.indexes.base import InvalidIndexError +from pandas.util.testing import assert_frame_equal + + +indexes = [ + # indexes listed here must be sorted + + # base + pd.Index(['A', 'B', 'C']), + + # numeric + pd.RangeIndex(3), + pd.Int64Index([3, 4, 5]), + pd.UInt64Index([6, 7, 8]), + pd.Float64Index([3.5, 4.5, 5.5]), + + # datetime + pd.to_datetime(['2013-01-01', '2013-01-10', '2013-01-15']), + pd.to_timedelta(['1 day', '2 days', '3 days']), + pd.PeriodIndex(start='2000', periods=3), + + # interval + pd.interval_range(start=0, end=3), + + # categorical + pd.CategoricalIndex('A B C'.split()), + pd.CategoricalIndex('D E F'.split(), ordered=True), + + # multi-index + pd.MultiIndex.from_arrays(['A B C'.split(), 'D E F'.split()]), +] + + +index_sort_groups = [ + # When indexes from the same group are joined, the result is sortable. + # When indexes from different groups are joined, the result is not + # sortable. 
+ + [ # joining produces a string index + pd.Index(['A', 'B', 'C']), + pd.CategoricalIndex('A B C'.split()), + pd.CategoricalIndex('D E F'.split(), ordered=True)], + + [ # numeric indexes + pd.RangeIndex(3), + pd.Int64Index([3, 4, 5]), + pd.UInt64Index([6, 7, 8]), + pd.Float64Index([3.5, 4.5, 5.5])], + + [pd.to_datetime(['2013-01-01', '2013-01-10', '2013-01-15'])], + [pd.to_timedelta(['1 day', '2 days', '3 days'])], + [pd.PeriodIndex(start='2000', periods=3)], + [pd.interval_range(start=0, end=3)], + [pd.MultiIndex.from_arrays(['A B C'.split(), 'D E F'.split()])], +] + + +def cls_name(obj): + return obj.__class__.__name__ + + +@pytest.fixture(params=[True, False]) +def sort(request): + """Boolean sort keyword for DataFrame.append + """ + return request.param + + +class TestAppendBasic(object): + def test_different_types_of_input(self, sort): + # There are 7 types of accepted input by append: + # + # dict + # Series + # DataFrame + # empty list + # list of dicts + # list of Series + # list of DataFrames + # + # Using one or another should always be interchangeable. 
+ + # append to dict + df = pd.DataFrame([[1, 2, 3], [4, 5, 6]]) + map = { + 0: 7, + 1: 8, + 2: 9 + } + result = df.append(map, ignore_index=True, sort=sort) + expected = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) + assert_frame_equal(result, expected) + + # append to Series + df = pd.DataFrame([[1, 2, 3], [4, 5, 6]]) + ser = pd.Series([7, 8, 9]) + result = df.append(ser, ignore_index=True, sort=sort) + expected = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) + assert_frame_equal(result, expected) + + # append to DataFrame + df1 = pd.DataFrame([[1, 2, 3], [4, 5, 6]]) + df2 = pd.DataFrame([[7, 8, 9]]) + result = df1.append(df2, ignore_index=True, sort=sort) + expected = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) + assert_frame_equal(result, expected) + + # append to empty list + df = pd.DataFrame([[1, 2, 3], [4, 5, 6]]) + result = df1.append([], sort=sort) + expected = df + assert_frame_equal(result, expected) + + # append to list of dicts + df = pd.DataFrame([[1, 2, 3], [4, 5, 6]]) + map = { + 0: 7, + 1: 8, + 2: 9 + } + result = df.append([map], ignore_index=True, sort=sort) + expected = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) + assert_frame_equal(result, expected) + + # append to list of Series + df = pd.DataFrame([[1, 2, 3], [4, 5, 6]]) + ser = pd.Series([7, 8, 9]) + result = df.append([ser], ignore_index=True, sort=sort) + expected = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) + assert_frame_equal(result, expected) + + # append to list of DataFrames + df1 = pd.DataFrame([[1, 2, 3], [4, 5, 6]]) + df2 = pd.DataFrame([[7, 8, 9]]) + result = df1.append([df2], ignore_index=True, sort=sort) + expected = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) + assert_frame_equal(result, expected) + + # append to list of dicts (2 dicts) + df = pd.DataFrame([[1, 2, 3], [4, 5, 6]]) + map = { + 0: 7, + 1: 8, + 2: 9 + } + result = df.append([map, map], ignore_index=True, sort=sort) + expected = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9], [7, 8, 9]]) + 
assert_frame_equal(result, expected) + + # append to list of Series (2 series) + df = pd.DataFrame([[1, 2, 3], [4, 5, 6]]) + ser = pd.Series([7, 8, 9]) + result = df.append([ser, ser], ignore_index=True, sort=sort) + expected = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9], [7, 8, 9]]) + assert_frame_equal(result, expected) + + # append to list of DataFrames (2 dframes) + df1 = pd.DataFrame([[1, 2, 3], [4, 5, 6]]) + df2 = pd.DataFrame([[7, 8, 9]]) + result = df1.append([df2, df2], ignore_index=True, sort=sort) + expected = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9], [7, 8, 9]]) + assert_frame_equal(result, expected) + + def test_bad_input_type(self, sort): + # When appending a bad input type, the function + # should raise an exception. + + bad_input_msg = r'The value of other must be .*' + mixed_list_msg = r'When other is a list, its .*' + + # integer input + df = pd.DataFrame([[1, 2, 3], [4, 5, 6]]) + with pytest.raises(TypeError, match=bad_input_msg): + df.append(1, ignore_index=True, sort=sort) + + # string input + df = pd.DataFrame([[1, 2, 3], [4, 5, 6]]) + with pytest.raises(TypeError, match=bad_input_msg): + df.append("1 2 3", ignore_index=True, sort=sort) + + # tuple input + df = pd.DataFrame([[1, 2, 3], [4, 5, 6]]) + with pytest.raises(TypeError, match=bad_input_msg): + df.append((df, ), ignore_index=True, sort=sort) + + # list of integers + df = pd.DataFrame([[1, 2, 3], [4, 5, 6]]) + with pytest.raises(TypeError, match=bad_input_msg): + df.append([1], ignore_index=True, sort=sort) + + # list of strings + df = pd.DataFrame([[1, 2, 3], [4, 5, 6]]) + with pytest.raises(TypeError, match=bad_input_msg): + df.append(["1 2 3"], ignore_index=True, sort=sort) + + # list of lists + df = pd.DataFrame([[1, 2, 3], [4, 5, 6]]) + with pytest.raises(TypeError, match=bad_input_msg): + df.append([[df]], ignore_index=True, sort=sort) + + # list of tuples + df = pd.DataFrame([[1, 2, 3], [4, 5, 6]]) + with pytest.raises(TypeError, match=bad_input_msg): + df.append([(df, )], 
ignore_index=True, sort=sort) + + # mixed list + df = pd.DataFrame([[1, 2, 3], [4, 5, 6]]) + ser = pd.Series([7, 8, 9]) + dict = { + 0: 10, + 1: 11, + 2: 12 + } + with pytest.raises(TypeError, match=mixed_list_msg): + df.append([ser, dict], ignore_index=True, sort=sort) + with pytest.raises(TypeError, match=mixed_list_msg): + df.append([dict, ser], ignore_index=True, sort=sort) + + # mixed list with bad first element + # (when the first element is bad, display the + # bad input msg instead of the mixed list one) + df = pd.DataFrame([[1, 2, 3], [4, 5, 6]]) + ser = pd.Series([7, 8, 9]) + with pytest.raises(TypeError, match=bad_input_msg): + df.append([1, ser, ser], ignore_index=True, sort=sort) + + # mixed list with bad second element + df = pd.DataFrame([[1, 2, 3], [4, 5, 6]]) + ser = pd.Series([7, 8, 9]) + with pytest.raises(TypeError, match=mixed_list_msg): + df.append([ser, 1, ser], ignore_index=True, sort=sort) + + # mixed list with bad third element + df = pd.DataFrame([[1, 2, 3], [4, 5, 6]]) + ser = pd.Series([7, 8, 9]) + with pytest.raises(TypeError, match=mixed_list_msg): + df.append([ser, ser, 1], ignore_index=True, sort=sort) + + def test_no_unecessary_upcast(self, sort): + # GH: 22621 + # When appending, the resulting columns should + # not be float64 without necessity. 
+ + # basic + df1 = pd.DataFrame([[1, 2, 3]]) + df2 = pd.DataFrame([[4, 5, 6]], index=[1]) + result = df1.append(df2, sort=sort) + expected = pd.DataFrame([[1, 2, 3], [4, 5, 6]]) + assert_frame_equal(result, expected) + + # 0 rows 0 columns + df1 = pd.DataFrame([[1, 2, 3]]) + df2 = pd.DataFrame() + result = df1.append(df2, sort=sort) + expected = df1.copy() + assert_frame_equal(result, expected) + + df1 = pd.DataFrame() + df2 = pd.DataFrame([[1, 2, 3]]) + result = df1.append(df2, sort=sort) + expected = df2.copy() + assert_frame_equal(result, expected) + + # 0 rows 2 columns + # (the original dtype (object) of the empty columns + # must be preserved) + df1 = pd.DataFrame([[1, 2, 3]], columns=[0, 1, 2]) + df2 = pd.DataFrame(columns=[3, 4]) + result = df1.append(df2, sort=sort) + expected = pd.DataFrame([[1, 2, 3, np.nan, np.nan]]) + expected[[3, 4]] = expected[[3, 4]].astype(object) + assert_frame_equal(result, expected) + + df1 = pd.DataFrame(columns=[0, 1]) + df2 = pd.DataFrame([[1, 2, 3]], columns=[2, 3, 4]) + result = df1.append(df2, sort=sort) + expected = pd.DataFrame([[np.nan, np.nan, 1, 2, 3]]) + expected[[0, 1]] = expected[[0, 1]].astype(object) + assert_frame_equal(result, expected) + + # big.append(small) + big = pd.DataFrame([[1, 2, 3]]) + small = pd.DataFrame([[4, 5]], index=[1]) + result = big.append(small, sort=sort) + expected = pd.DataFrame([[1, 2, 3], [4, 5, np.nan]]) + assert_frame_equal(result, expected) + + # small.append(big) + small = pd.DataFrame([[1, 2]]) + big = pd.DataFrame([[3, 4, 5]], index=[1]) + result = small.append(big, sort=sort) + expected = pd.DataFrame([[1, 2, np.nan], [3, 4, 5]]) + assert_frame_equal(result, expected) + + +class TestAppendColumnsIndex(object): + @pytest.mark.parametrize('idx_name3', [None, 'foo', 'bar', 'baz']) + @pytest.mark.parametrize('idx_name2', [None, 'foo', 'bar', 'baz']) + @pytest.mark.parametrize('idx_name1', [None, 'foo', 'bar', 'baz']) + def test_preserve_index_name(self, sort, idx_name1, idx_name2, 
idx_name3): + # When appending, the name of the indexes + # of the base DataFrame must always be + # preserved in the result. + + df1 = pd.DataFrame([[1, 2, 3]]) + df2 = pd.DataFrame([[4, 5, 6]], index=[1]) + df3 = pd.DataFrame([[7, 8, 9]], index=[2]) + + df1.columns.name = idx_name1 + df2.columns.name = idx_name2 + df3.columns.name = idx_name3 + + # append [] + result = df1.append([], sort=sort) + expected = df1.copy() + assert_frame_equal(result, expected) + + # append [df] + result = df1.append([df2], sort=sort) + expected = pd.DataFrame([[1, 2, 3], [4, 5, 6]]) + expected.columns.name = idx_name1 + assert_frame_equal(result, expected) + + # append [df, df] + result = df1.append([df2, df3], sort=sort) + expected = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) + expected.columns.name = idx_name1 + assert_frame_equal(result, expected) + + @pytest.mark.parametrize('index', indexes, ids=cls_name) + def test_preserve_index_type(self, sort, index): + # when there's only one index type in the inputs, + # it must be preserved in the output. 
+ + # basic + df1 = pd.DataFrame([[1, 2, 3]], columns=index) + df2 = pd.DataFrame([[4, 5, 6]], index=[1], columns=index) + result = df1.append(df2, sort=sort) + expected = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=index) + assert_frame_equal(result, expected) + + # big.append(small) + big = pd.DataFrame([[1, 2, 3]], columns=index) + small = pd.DataFrame([[4, 5]], index=[1], columns=index[:2]) + result = big.append(small, sort=sort) + expected = pd.DataFrame([[1, 2, 3], [4, 5, np.nan]], columns=index) + assert_frame_equal(result, expected) + + # small.append(big) + small = pd.DataFrame([[1, 2]], columns=index[:2]) + big = pd.DataFrame([[3, 4, 5]], index=[1], columns=index) + result = small.append(big, sort=sort) + expected = pd.DataFrame([[1, 2, np.nan], [3, 4, 5]], columns=index) + assert_frame_equal(result, expected) + + @pytest.mark.parametrize('index2', indexes, ids=cls_name) + @pytest.mark.parametrize('index1', indexes, ids=cls_name) + def test_preserve_index_values(self, sort, index1, index2): + # When appending indexes of different types, we want + # the resulting index to preserve the exact indexes + # values. + + df1 = pd.DataFrame([[1, 2, 3]], columns=index1) + df2 = pd.DataFrame([[4, 5, 6]], columns=index2, index=[1]) + result = df1.append(df2, sort=sort) + for value in index1: + assert value in result.columns + for value in index2: + assert value in result.columns + + def test_raise_on_duplicates(self, sort): + # Append should not allow DataFrames with repeated + # column names (or series with repeated row names). 
+ + # dupe on base + df1 = pd.DataFrame([[1, 2, 3]], columns=['A', 'B', 'B']) + df2 = pd.DataFrame([[1, 2, 3]], columns=['A', 'B', 'C']) + with pytest.raises(InvalidIndexError): + df1.append([], sort=sort) + with pytest.raises(InvalidIndexError): + df1.append([df2], sort=sort) + with pytest.raises(InvalidIndexError): + df1.append([df2, df2], sort=sort) + + # dupe on other + df1 = pd.DataFrame([[1, 2, 3]], columns=['A', 'B', 'C']) + df2 = pd.DataFrame([[1, 2, 3]], columns=['A', 'B', 'B']) + with pytest.raises(InvalidIndexError): + df1.append([df2], sort=sort) + with pytest.raises(InvalidIndexError): + df1.append([df2, df2], sort=sort) + + # dupe on both + # (we could avoid raising errors here, but, to keep the api + # consistent, we don't) + df1 = pd.DataFrame([[1, 2, 3]], columns=['A', 'B', 'B']) + df2 = pd.DataFrame([[1, 2, 3]], columns=['A', 'B', 'B']) + with pytest.raises(InvalidIndexError): + df1.append([], sort=sort) + with pytest.raises(InvalidIndexError): + df1.append([df2], sort=sort) + with pytest.raises(InvalidIndexError): + df1.append([df2, df2], sort=sort) + + def test_nosort_basic(self): + # When sort=False, the resulting columns come + # in the order that they appear in the inputs. 
+ + nan = np.nan + + # NUMERIC INDEX TESTS + + # append [] + df = pd.DataFrame([[1, 2, 3]], columns=[0, 1, 2]) + result = df.append([], sort=False) + expected = df[[0, 1, 2]] + assert_frame_equal(result, expected) + + df = pd.DataFrame([[1, 2, 3]], columns=[2, 1, 0]) + result = df.append([], sort=False) + expected = df[[2, 1, 0]] + assert_frame_equal(result, expected) + + # append [df] + df1 = pd.DataFrame([[1, 2]], columns=[0.0, 1.0]) + df2 = pd.DataFrame([[1, 2]], columns=[0.5, 1.5], index=[1]) + result = df1.append(df2, sort=False) + expected = pd.DataFrame([[1, 2, nan, nan], + [nan, nan, 1, 2]], + columns=[0.0, 1.0, 0.5, 1.5]) + assert_frame_equal(result, expected) + + # append [df, df] + df1 = pd.DataFrame([[1, 2]], columns=[0.0, 1.0]) + df2 = pd.DataFrame([[1, 2]], columns=[0.3, 1.3], index=[1]) + df3 = pd.DataFrame([[1, 2]], columns=[0.6, 1.6], index=[2]) + result = df1.append([df2, df3], sort=False) + expected = pd.DataFrame([[1, 2, nan, nan, nan, nan], + [nan, nan, 1, 2, nan, nan], + [nan, nan, nan, nan, 1, 2]], + columns=[0.0, 1.0, 0.3, 1.3, 0.6, 1.6]) + assert_frame_equal(result, expected) + + # STRING INDEX TESTS + + # append [] + df = pd.DataFrame([[1, 2, 3]], columns=['a', 'b', 'c']) + result = df.append([], sort=False) + expected = df[['a', 'b', 'c']] + assert_frame_equal(result, expected) + + df = pd.DataFrame([[1, 2, 3]], columns=['c', 'b', 'a']) + result = df.append([], sort=False) + expected = df[['c', 'b', 'a']] + assert_frame_equal(result, expected) + + # append [df] + df1 = pd.DataFrame([[1, 2]], columns=['a', 'c']) + df2 = pd.DataFrame([[1, 2]], columns=['b', 'd'], index=[1]) + result = df1.append(df2, sort=False) + expected = pd.DataFrame([[1, 2, nan, nan], + [nan, nan, 1, 2]], + columns=['a', 'c', 'b', 'd']) + assert_frame_equal(result, expected) + + # append [df, df] + df1 = pd.DataFrame([[1, 2]], columns=['a', 'd']) + df2 = pd.DataFrame([[1, 2]], columns=['b', 'e'], index=[1]) + df3 = pd.DataFrame([[1, 2]], columns=['c', 'f'], index=[2]) 
+ result = df1.append([df2, df3], sort=False) + expected = pd.DataFrame([[1, 2, nan, nan, nan, nan], + [nan, nan, 1, 2, nan, nan], + [nan, nan, nan, nan, 1, 2]], + columns=['a', 'd', 'b', 'e', 'c', 'f']) + assert_frame_equal(result, expected) + + def test_sort_basic(self): + # When sort=True, the resulting columns must come + # out sorted. + + nan = np.nan + + # NUMERIC INDEX TESTS + + # append [] + df = pd.DataFrame([[1, 2, 3]], columns=[0, 1, 2]) + result = df.append([], sort=True) + expected = df[[0, 1, 2]] + assert_frame_equal(result, expected) + + df = pd.DataFrame([[1, 2, 3]], columns=[2, 1, 0]) + result = df.append([], sort=True) + expected = df[[0, 1, 2]] + assert_frame_equal(result, expected) + + # append [df] + df1 = pd.DataFrame([[1, 2]], columns=[0.0, 1.0]) + df2 = pd.DataFrame([[1, 2]], columns=[0.5, 1.5], index=[1]) + result = df1.append(df2, sort=True) + expected = pd.DataFrame([[1, nan, 2, nan], + [nan, 1, nan, 2]], + columns=[0.0, 0.5, 1.0, 1.5]) + assert_frame_equal(result, expected) + + # append [df, df] + df1 = pd.DataFrame([[1, 2]], columns=[0.0, 1.0]) + df2 = pd.DataFrame([[1, 2]], columns=[0.3, 1.3], index=[1]) + df3 = pd.DataFrame([[1, 2]], columns=[0.6, 1.6], index=[2]) + result = df1.append([df2, df3], sort=True) + expected = pd.DataFrame([[1, nan, nan, 2, nan, nan], + [nan, 1, nan, nan, 2, nan], + [nan, nan, 1, nan, nan, 2]], + columns=[0.0, 0.3, 0.6, 1.0, 1.3, 1.6]) + assert_frame_equal(result, expected) + + # STRING INDEX TESTS + + # append [] + df = pd.DataFrame([[1, 2, 3]], columns=['a', 'b', 'c']) + result = df.append([], sort=True) + expected = df[['a', 'b', 'c']] + assert_frame_equal(result, expected) + + df = pd.DataFrame([[1, 2, 3]], columns=['c', 'b', 'a']) + result = df.append([], sort=True) + expected = df[['a', 'b', 'c']] + assert_frame_equal(result, expected) + + # append [df] + df1 = pd.DataFrame([[1, 2]], columns=['a', 'c']) + df2 = pd.DataFrame([[1, 2]], columns=['b', 'd'], index=[1]) + result = df1.append(df2, sort=True) 
+ expected = pd.DataFrame([[1, nan, 2, nan], + [nan, 1, nan, 2]], + columns=['a', 'b', 'c', 'd']) + assert_frame_equal(result, expected) + + # append [df, df] + df1 = pd.DataFrame([[1, 2]], columns=['a', 'd']) + df2 = pd.DataFrame([[1, 2]], columns=['b', 'e'], index=[1]) + df3 = pd.DataFrame([[1, 2]], columns=['c', 'f'], index=[2]) + result = df1.append([df2, df3], sort=True) + expected = pd.DataFrame([[1, nan, nan, 2, nan, nan], + [nan, 1, nan, nan, 2, nan], + [nan, nan, 1, nan, nan, 2]], + columns=['a', 'b', 'c', 'd', 'e', 'f']) + assert_frame_equal(result, expected) + + @pytest.mark.parametrize('index2', indexes, ids=cls_name) + @pytest.mark.parametrize('index1', indexes, ids=cls_name) + def test_index_types_without_sort(self, index1, index2): + # We should be able to append to a DataFrame + # regardless of the type of its index. + + # the code below should not raise any exceptions + df1 = pd.DataFrame([[1, 2, 3]], columns=index1) + df2 = pd.DataFrame([[4, 5, 6]], columns=index2, index=[1]) + df1.append([], sort=False) + df1.append([df2], sort=False) + df1.append([df2, df2], sort=False) + + @pytest.mark.parametrize( + 'index1, index2', + [(i1, i2) + for group in index_sort_groups + for i1, i2 in product(group, repeat=2)], + ids=cls_name + ) + def test_index_types_with_possible_sort(self, index1, index2): + # When the result of joining two indexes is sortable, + # we should not raise any exceptions. 
+ + df1 = pd.DataFrame([[1, 2, 3]], columns=index1) + df2 = pd.DataFrame([[4, 5, 6]], columns=index2, index=[1]) + df1.append([], sort=True) # sorts the original frame + df1.append([df2], sort=True) + df1.append([df2, df2], sort=True) + + @pytest.mark.parametrize( + 'index1, index2', + [(i1, i2) + for g1, g2 in product(index_sort_groups, repeat=2) + # different sort groups + if type(g1[0]) != type(g2[0]) + for i1, i2 in product(g1, g2)], + ids=cls_name + ) + def test_index_types_with_impossible_sort(self, index1, index2): + # When the result of joining two indexes is not sortable, + # we should raise an exception. + + err_msg = r'The resulting columns could not be sorted.*' # TODO + + df1 = pd.DataFrame([[1, 2, 3]], columns=index1) + df2 = pd.DataFrame([[4, 5, 6]], columns=index2, index=[1]) + + with pytest.raises(TypeError, match=err_msg): + df1.append([df2], sort=True) + with pytest.raises(TypeError, match=err_msg): + df1.append([df2, df2], sort=True) + + +class TestAppendRowsIndex(object): + @pytest.mark.parametrize('idx_name3', [None, 'foo', 'bar', 'baz']) + @pytest.mark.parametrize('idx_name2', [None, 'foo', 'bar', 'baz']) + @pytest.mark.parametrize('idx_name1', [None, 'foo', 'bar', 'baz']) + def test_preserve_index_name(self, sort, idx_name1, idx_name2, idx_name3): + # When appending, the name of the indexes + # of the base DataFrame must always be + # preserved in the result. 
+ + df1 = pd.DataFrame([[1, 2, 3]]) + df2 = pd.DataFrame([[4, 5, 6]], index=[1]) + df3 = pd.DataFrame([[7, 8, 9]], index=[2]) + + df1.index.name = idx_name1 + df2.index.name = idx_name2 + df3.index.name = idx_name3 + + # append [] + result = df1.append([], sort=sort) + expected = df1.copy() + assert_frame_equal(result, expected) + + # append [df] + result = df1.append([df2], sort=sort) + expected = pd.DataFrame([[1, 2, 3], [4, 5, 6]]) + expected.index.name = idx_name1 + assert_frame_equal(result, expected) + + # append [df, df] + result = df1.append([df2, df3], sort=sort) + expected = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) + expected.index.name = idx_name1 + assert_frame_equal(result, expected) + + @pytest.mark.parametrize('index', indexes, ids=cls_name) + def test_preserve_index_type(self, sort, index): + # when there's only one index type in the inputs, + # it must be preserved in the output. + + index1 = index[:1] + index2 = index[1:2] + index_comb = index1.append(index2) + + df1 = pd.DataFrame([[1, 2, 3]], index=index1) + df2 = pd.DataFrame([[4, 5, 6]], index=index2) + result = df1.append(df2, sort=sort) + expected = pd.DataFrame([[1, 2, 3], [4, 5, 6]], index=index_comb) + assert_frame_equal(result, expected) + + @pytest.mark.parametrize('index2', indexes, ids=cls_name) + @pytest.mark.parametrize('index1', indexes, ids=cls_name) + def test_preserve_index_values(self, sort, index1, index2): + # When appending indexes of different types, we want + # the resulting index to preserve the exact indexes + # values. + + df1 = pd.DataFrame([[1, 2, 3]], index=index1[:1]) + df2 = pd.DataFrame([[4, 5, 6]], index=index2[:1]) + result = df1.append(df2, sort=sort) + assert index1[0] in result.index + assert index2[0] in result.index + + def test_duplicates_without_verify_integrity(self): + # When verify_integrity=False, the function should + # allow duplicate values in the rows index. 
+ + raise NotImplementedError + + def test_duplicates_with_verify_integrity(self): + # When verify_integrity=True, the function should + # not allow duplicate values in the rows index (whether + # in the input or output). + + raise NotImplementedError + + def test_ignore_index(self): + # When ignore_index=True, the function should completely + # ignore the input indexes and generate one that is brand + # new (RangeIndex). + + raise NotImplementedError + + def test_warning_ignore_index_and_verify_integrity(self): + # It makes no sense to set verify_integrity=True when + # ignore_index=True. To warn of a possible user + # misunderstanding, append should raise a warning in + # this situation. + + raise NotImplementedError + + +class TestAppendDangling(object): + """Tests that have not been concretized yet + """ + + def test_append_unnamed_series_raises(self, sort): + dict_msg = 'Can only append a dict if ignore_index=True' + df = pd.DataFrame([[1, 2, 3], [4, 5, 6]]) + dict = { + 0: 7, + 1: 8, + 2: 9 + } + with pytest.raises(TypeError, match=dict_msg): + df.append(dict, sort=sort) + with pytest.raises(TypeError, match=dict_msg): + df.append([dict], sort=sort) + with pytest.raises(TypeError, match=dict_msg): + df.append([dict, dict], sort=sort) + + series_msg = 'Can only append a Series if ignore_index=True or .*' + df = pd.DataFrame([[1, 2, 3], [4, 5, 6]]) + series = pd.Series([7, 8, 9]) + with pytest.raises(TypeError, match=series_msg): + df.append(series, sort=sort) + with pytest.raises(TypeError, match=series_msg): + df.append([series], sort=sort) + with pytest.raises(TypeError, match=series_msg): + df.append([series, series], sort=sort) + + indexes = [ + None, + pd.Index([0, 1]), + pd.Index(['a', 'b']), + pd.Index(['a', 'b'], name='foo') + ] + + @pytest.mark.parametrize('index1', indexes, ids=lambda x: repr(x)) + @pytest.mark.parametrize('index2', indexes, ids=lambda x: repr(x)) + def test_append_ignore_index(self, sort, index1, index2): + # when appending with 
ignore_index=True, + # all index content must be forgotten + df1 = pd.DataFrame([[1, 2, 3], [4, 5, 6]], index=index1) + df2 = pd.DataFrame([[1, 2, 3], [4, 5, 6]], index=index2) + + result = df1.append(df2, ignore_index=True) + expected = pd.DataFrame([[1, 2, 3], [4, 5, 6], + [1, 2, 3], [4, 5, 6]]) + assert_frame_equal(result, expected) + + result = df1.append([df2], ignore_index=True) + expected = pd.DataFrame([[1, 2, 3], [4, 5, 6], + [1, 2, 3], [4, 5, 6]]) + assert_frame_equal(result, expected) + + result = df1.append([df2, df2], ignore_index=True) + expected = pd.DataFrame([[1, 2, 3], [4, 5, 6], + [1, 2, 3], [4, 5, 6], + [1, 2, 3], [4, 5, 6]]) + assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index bb7a09bf7debd..b9a5caeb4a2e7 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -1009,44 +1009,6 @@ def test_append_missing_column_proper_upcast(self, sort): assert appended['A'].dtype == 'f8' assert appended['B'].dtype == 'O' - def test_append_series_no_unecessary_upcast(self, sort): - # case 1 - # append to DataFrame with no rows - # columns from the Series should preserve the Series dtype - - df = pd.DataFrame() - series = pd.Series([1, 2, 3], name=0) - result = df.append(series, sort=sort) - expected = pd.DataFrame([[1, 2, 3]]) - assert_frame_equal(result, expected) - - df = pd.DataFrame(columns=[0]) - series = pd.Series([1, 2, 3], index=[1, 2, 3], name=0) - result = df.append(series, sort=sort) - expected = pd.DataFrame([[np.nan, 1, 2, 3]], columns=[0, 1, 2, 3]) - expected[0] = expected[0].astype(object) # original dtype of df - assert_frame_equal(result, expected) - - # case 2 - # append to DataFrame bigger than Series - # columns that come from both should preserve the dtype - - df = pd.DataFrame([[1, 2, 3, 4]]) - series = pd.Series([1, 2, 3], name=1) - result = df.append(series, sort=sort) - expected = pd.DataFrame([[1, 2, 3, 4.], [1, 2, 3, 
np.nan]]) - assert_frame_equal(result, expected) - - # case 3 - # append to DataFrame smaller than Series - # columns that come from both should preserve the dtype - - df = pd.DataFrame([[1, 2, 3]]) - series = pd.Series([1, 2, 3, 4], name=1) - result = df.append(series, sort=sort) - expected = pd.DataFrame([[1, 2, 3, np.nan], [1, 2, 3, 4.]]) - assert_frame_equal(result, expected) - class TestConcatenate(ConcatenateBase): From 0d4a832f6334a6cfe9ddd64ac7c8a4221544c64b Mon Sep 17 00:00:00 2001 From: araraonline Date: Wed, 26 Sep 2018 18:53:06 -0300 Subject: [PATCH 18/30] Implement append reindexing code Because concat does not have a clear reindexing behavior on the columns, I've implemented this behavior separately on DataFrame.append. Actually, the reindexing is implemented at pandas/core/indexes/api.py and is called from DataFrame.append. The main catch here is that any column types are allowed to be concatenated and, when sort is not possible, it raises an error. Another thing that was added was xfail marks on the tests. These represent parts of the code that haven't been implemented yet or fixes that are better on another PR. The behavior for sort=None isn't totally sorted out yet. --- pandas/core/frame.py | 55 ++++++----- pandas/core/indexes/api.py | 141 +++++++++++++++++++++++++++- pandas/core/indexes/base.py | 4 + pandas/tests/reshape/test_append.py | 70 +++++++++++++- 4 files changed, 238 insertions(+), 32 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 36a1aba411d1c..fddad2a6a4287 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6420,12 +6420,42 @@ def _append_list_of_frames(self, other, *args, **kwargs): _obj_type = kwargs['_obj_type'] _item_type = kwargs.get('_item_type') + from pandas.core.indexes.api import ( + CannotSortError, + _normalize_dataframes, + ) from pandas.core.reshape.concat import concat + # The default value of sort in version 0.23.0 is None.
+ # The behavior when this was the value is very + # varied and changes according to input type, columns index + # type, whether a reindex is necessary or not, etc. + # + # The code below is a try to reproduce the old behavior, + # but note that this is deprecated. + # + # TODO: handle sort=None here + + # The behavior of concat is a bit problematic as it is. To get around + # this, we prepare the DataFrames before feeding them into concat. to_concat = [self] + other - result = concat(to_concat, ignore_index=ignore_index, + try: + to_concat_norm = _normalize_dataframes(to_concat, sort=sort) + except CannotSortError: + raise TypeError("The resulting columns could not be sorted." + " You can try setting sort=False or use" + " compatible index types.") + result = concat(to_concat_norm, ignore_index=ignore_index, verify_integrity=verify_integrity, sort=sort) + # preserve base DataFrame indexes names + # XXX: how will this work with MultiIndex (?) + result.columns.name = self.columns.name + if not ignore_index: + result.index.name = self.index.name + + # the conditionals below will be refactored or removed + if sort is None: # The sorting behaviour for None was weird. # It is getting deprecated. @@ -6452,19 +6482,6 @@ def _append_list_of_frames(self, other, *args, **kwargs): else: sort = True - if not sort: - # Concat sorts the column indexes if they are 'special'. - # We don't want this behaviour if sort is False. 
- result_idx = self.columns - for frame in other: - column_idx = frame.columns - idx_diff = column_idx.difference(result_idx) - try: - result_idx = result_idx.append(idx_diff) - except TypeError: - result_idx = result_idx.astype(object).append(idx_diff) - result = result.reindex(columns=result_idx, copy=False) - if result.shape[0] == 1: # If we got only one row of result, this means that # the resulting dtypes can match the dframe where @@ -6479,16 +6496,6 @@ def _append_list_of_frames(self, other, *args, **kwargs): base_dtypes = base_frame.dtypes result[base_columns] = result[base_columns].astype(base_dtypes) - if not ignore_index: - # We want to keep the index name of the original dframe (self). - # Rename the index after concat erases it. - result.index.name = self.index.name - - # keep the same column index name as - # the original dframe (self) - # XXX: will this break anything in MultiIndex? - result.columns.name = self.columns.name - return result def join(self, other, on=None, how='left', lsuffix='', rsuffix='', diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py index e50a4b099a8e1..42f4b0c0fb0fa 100644 --- a/pandas/core/indexes/api.py +++ b/pandas/core/indexes/api.py @@ -1,11 +1,20 @@ import textwrap import warnings -from pandas.core.indexes.base import (Index, - _new_Index, - ensure_index, - ensure_index_from_sequences, - InvalidIndexError) # noqa +from pandas.core.dtypes.generic import ( + ABCCategoricalIndex, + ABCIntervalIndex, + ABCMultiIndex, + ABCPeriodIndex, +) +from pandas.core.indexes.base import ( + Index, + _new_Index, + ensure_index, + ensure_index_from_sequences, + CannotSortError, + InvalidIndexError +) from pandas.core.indexes.category import CategoricalIndex # noqa from pandas.core.indexes.multi import MultiIndex # noqa from pandas.core.indexes.interval import IntervalIndex # noqa @@ -160,3 +169,125 @@ def _all_indexes_same(indexes): if not first.equals(index): return False return True + + +def 
_normalize_dataframes(frame_list, verify_inputs=True, sort=False): + """Normalize the columns from a list of DataFrames + + First, an index is created by merging all the original columns. Then, + all columns are reindexed to match this new index. + + Parameters + ---------- + index_list: list of Index objects + verify_inputs: boolean, default True + Verify if the input indexes contain overlapping values. + sort: boolean, default False + Order result index. If False, values will come in the order they + appear. + + Raises + ------ + CannotSortError + When sort=True and the result index is not sortable. + InvalidIndexError + When verify_inputs=True and 1+ of the indexes contain duplicates. + """ + orig_columns = [df.columns for df in frame_list] + merged_columns = _merge_index_list(orig_columns, verify_inputs, sort) + return [_reindex(df, merged_columns, axis=1) for df in frame_list] + + +def _merge_index_list(index_list, verify_inputs=True, sort=False): + """Merge a list of indexes into one big index + + Parameters + ---------- + index_list: list of Index objects + verify_inputs: boolean, default True + Verify if the input indexes contain overlapping values. + sort: boolean, default False + Order result index. If False, values will come in the order they + appear. + + Raises + ------ + CannotSortError + When sort=True and the result index is not sortable. + InvalidIndexError + When verify_inputs=True and 1+ of the indexes contain duplicates. 
+ + Examples + -------- + """ + if verify_inputs: + if any([ix.has_duplicates for ix in index_list]): + raise InvalidIndexError("Input index has duplicate values") + + result = index_list[0] + for idx in index_list[1:]: + result = _merge_indexes(result, idx) + + return result if not sort else _sort_index(result) + + +def _merge_indexes(index1, index2): + """Merge two indexes together + """ + + # lots of exception handling because we want to allow any + # indexes types to be merged together + + try: + difference = index2.difference(index1) + except (TypeError, ValueError): + if isinstance(index2, (ABCIntervalIndex, ABCPeriodIndex)): + index2 = index2.astype(object) + difference = index2.difference(index1) + else: + raise + + try: + return index1.append(difference) + except TypeError: + if isinstance(index1, ABCCategoricalIndex): + index1 = index1.astype(object) + return index1.append(difference) + raise + + +def _sort_index(index): + """Sort index and raises when not possible + """ + try: + return index.sort_values() + except TypeError: + raise CannotSortError + + +def _reindex(df, new_index, axis=0): + """Reindex df axis to match new_index + + Parameters + ---------- + + df: a DataFrame object + new_index: an Index object + axis: int or str, default 0 + + Notes + ----- + + Works the same as DataFrame.reindex, but handles IntervalIndex and + MultiIndex errors. 
+ """ + try: + return df.reindex(columns=new_index, copy=False) + except TypeError: + if isinstance(df.columns, ABCIntervalIndex): + df.columns = df.columns.astype(object) + elif isinstance(df.columns, ABCMultiIndex): + df.columns = df.columns.values + else: + raise + return df.reindex(columns=new_index, copy=False) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index b2b6e02e908c5..40ce8faeba6bb 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -158,6 +158,10 @@ def index_arithmetic_method(self, other): return set_function_name(index_arithmetic_method, name, cls) +class CannotSortError(Exception): + pass + + class InvalidIndexError(Exception): pass diff --git a/pandas/tests/reshape/test_append.py b/pandas/tests/reshape/test_append.py index d096f01a6dda9..6e13532d1181b 100644 --- a/pandas/tests/reshape/test_append.py +++ b/pandas/tests/reshape/test_append.py @@ -361,14 +361,44 @@ def test_preserve_index_type(self, sort, index): @pytest.mark.parametrize('index2', indexes, ids=cls_name) @pytest.mark.parametrize('index1', indexes, ids=cls_name) - def test_preserve_index_values(self, sort, index1, index2): + def test_preserve_index_values_without_sort(self, index1, index2): # When appending indexes of different types, we want # the resulting index to preserve the exact indexes # values. 
+ # Related to GH13626 + from pandas.core.dtypes.generic import ( + ABCDatetimeIndex, ABCMultiIndex, ABCTimedeltaIndex + ) + if isinstance(index1, ABCMultiIndex): + if isinstance(index2, ABCDatetimeIndex): + pytest.xfail("MultiIndex + DatetimeIndex produces bad value") + if isinstance(index2, ABCTimedeltaIndex): + pytest.xfail("MultiIndex + TimedeltaIndex produces bad value") + df1 = pd.DataFrame([[1, 2, 3]], columns=index1) df2 = pd.DataFrame([[4, 5, 6]], columns=index2, index=[1]) - result = df1.append(df2, sort=sort) + result = df1.append(df2, sort=False) + for value in index1: + assert value in result.columns + for value in index2: + assert value in result.columns + + @pytest.mark.parametrize( + 'index1, index2', + [(i1, i2) + for group in index_sort_groups + for i1, i2 in product(group, repeat=2)], + ids=cls_name + ) + def test_preserve_index_values_with_sort(self, index1, index2): + # When appending indexes of different types, we want + # the resulting index to preserve the exact indexes + # values. + + df1 = pd.DataFrame([[1, 2, 3]], columns=index1) + df2 = pd.DataFrame([[4, 5, 6]], columns=index2, index=[1]) + result = df1.append(df2, sort=True) for value in index1: assert value in result.columns for value in index2: @@ -558,6 +588,12 @@ def test_index_types_without_sort(self, index1, index2): # We should be able to append to a DataFrame # regardless of the type of its index. 
+ # TODO: check end of append and create tests (empty / IntervalIndex) + # TODO: implement different way for df.append([]) + from pandas.core.dtypes.generic import ABCIntervalIndex + if isinstance(index1, ABCIntervalIndex): + pytest.xfail("Cannot do df[interval] for IntervalIndex") + # the code below should not raise any exceptions df1 = pd.DataFrame([[1, 2, 3]], columns=index1) df2 = pd.DataFrame([[4, 5, 6]], columns=index2, index=[1]) @@ -576,6 +612,12 @@ def test_index_types_with_possible_sort(self, index1, index2): # When the result of joining two indexes is sortable, # we should not raise any exceptions. + # TODO: check end of append and create tests (empty / IntervalIndex) + # TODO: implement different way for df.append([]) + from pandas.core.dtypes.generic import ABCIntervalIndex + if isinstance(index1, ABCIntervalIndex): + pytest.xfail("Cannot do df[interval] for IntervalIndex") + df1 = pd.DataFrame([[1, 2, 3]], columns=index1) df2 = pd.DataFrame([[4, 5, 6]], columns=index2, index=[1]) df1.append([], sort=True) # sorts the original frame @@ -595,7 +637,13 @@ def test_index_types_with_impossible_sort(self, index1, index2): # When the result of joining two indexes is not sortable, # we should raise an exception. - err_msg = r'The resulting columns could not be sorted.*' # TODO + # TODO: check end of append and create tests (empty / IntervalIndex) + # TODO: implement different way for df.append([]) + from pandas.core.dtypes.generic import ABCIntervalIndex + if isinstance(index1, ABCIntervalIndex): + pytest.xfail("Cannot do df[interval] for IntervalIndex") + + err_msg = r'The resulting columns could not be sorted\..*' df1 = pd.DataFrame([[1, 2, 3]], columns=index1) df2 = pd.DataFrame([[4, 5, 6]], columns=index2, index=[1]) @@ -662,6 +710,22 @@ def test_preserve_index_values(self, sort, index1, index2): # the resulting index to preserve the exact indexes # values. 
+ # Related to GH13626 + from pandas.core.dtypes.generic import ( + ABCDatetimeIndex, ABCMultiIndex, ABCTimedeltaIndex + ) + if isinstance(index1, ABCMultiIndex): + if isinstance(index2, ABCDatetimeIndex): + pytest.xfail("MultiIndex + DatetimeIndex produces bad value") + if isinstance(index2, ABCTimedeltaIndex): + pytest.xfail("MultiIndex + TimedeltaIndex produces bad value") + + # Concat raises a TypeError when appending a CategoricalIndex + # with another type + from pandas.core.dtypes.generic import ABCCategoricalIndex + if isinstance(index1, ABCCategoricalIndex): + pytest.xfail("Cannot have a CategoricalIndex append to another typ") + df1 = pd.DataFrame([[1, 2, 3]], index=index1[:1]) df2 = pd.DataFrame([[4, 5, 6]], index=index2[:1]) result = df1.append(df2, sort=sort) From c07e84c5fbfc8fb847f9435aa4552fd91f022938 Mon Sep 17 00:00:00 2001 From: araraonline Date: Thu, 27 Sep 2018 14:50:02 -0300 Subject: [PATCH 19/30] Convert result columns to specific dtypes --- pandas/core/frame.py | 22 +++++----- pandas/tests/reshape/test_append.py | 65 +++++++++++++++++++++++++++++ 2 files changed, 75 insertions(+), 12 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index fddad2a6a4287..27b222b8024ce 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6483,18 +6483,16 @@ def _append_list_of_frames(self, other, *args, **kwargs): sort = True if result.shape[0] == 1: - # If we got only one row of result, this means that - # the resulting dtypes can match the dframe where - # they come from. - # - # Concat achieves this behaviour when concatenating - # an empty DataFrame, but not if it has some columns. - # - # This is a hack for retrieving the dtypes back. 
- base_frame = [frame for frame in to_concat if frame.shape[0] == 1][0] - base_columns = base_frame.columns - base_dtypes = base_frame.dtypes - result[base_columns] = result[base_columns].astype(base_dtypes) + from pandas.core.dtypes.cast import find_common_type + + # Reindexing the columns created an artificial float64 where it + # was not needed. We can convert the columns back to the expected + # type. + + for col in result: + types = [df[col].dtype for df in to_concat if col in df] + common_type = find_common_type(types) + result[col] = result[col].astype(common_type) return result diff --git a/pandas/tests/reshape/test_append.py b/pandas/tests/reshape/test_append.py index 6e13532d1181b..85d47f4e9c68e 100644 --- a/pandas/tests/reshape/test_append.py +++ b/pandas/tests/reshape/test_append.py @@ -298,6 +298,71 @@ def test_no_unecessary_upcast(self, sort): expected = pd.DataFrame([[1, 2, np.nan], [3, 4, 5]]) assert_frame_equal(result, expected) + def test_preserve_empty_columns_dtype(self, sort): + # When appending to an empty DataFrame with columns, the dtype of these + # columns should be accounted for the output. 
+ + # append same size (default dtype) + df1 = pd.DataFrame(columns=list('ABC')) # object + df2 = pd.DataFrame([[1, 2, 3]], columns=list('ABC')) + + result1 = df1.append(df2, sort=sort) + result2 = df2.append(df1, sort=sort) + + expected = df2.astype(object) + assert_frame_equal(result1, expected) + assert_frame_equal(result2, expected) + + # GH: 22858 - df1 ends up float64 + # append same size (int64) + # df1 = pd.DataFrame(columns=list('ABC'), dtype='int64') + # df2 = pd.DataFrame([[1, 2, 3]], columns=list('ABC')) + + # result1 = df1.append(df2, sort=sort) + # result2 = df2.append(df1, sort=sort) + + # expected = df2.astype('int64') # same as df2 + # assert_frame_equal(result1, expected) + # assert_frame_equal(result2, expected) + + # append same size (float64) + df1 = pd.DataFrame(columns=list('ABC'), dtype='float64') + df2 = pd.DataFrame([[1, 2, 3]], columns=list('ABC')) + + result1 = df1.append(df2, sort=sort) + result2 = df2.append(df1, sort=sort) + + expected = df2.astype('float64') + assert_frame_equal(result1, expected) + assert_frame_equal(result2, expected) + + # append small/big - small empty + small = pd.DataFrame(columns=list('AB')) + big = pd.DataFrame([[1, 2, 3]], columns=list('ABC')) + + result1 = small.append(big, sort=sort) + result2 = big.append(small, sort=sort) + + expected = big.copy() + expected[['A', 'B']] = expected[['A', 'B']].astype(object) + assert_frame_equal(result1, expected) + assert_frame_equal(result2, expected) + + # append small/big - big empty + small = pd.DataFrame([[1, 2]], columns=list('AB')) + big = pd.DataFrame(columns=list('ABC')) + + result1 = small.append(big, sort=sort) + result2 = big.append(small, sort=sort) + + expected = pd.DataFrame( + [[1, 2, np.nan]], + columns=list('ABC'), + dtype=object + ) + assert_frame_equal(result1, expected) + assert_frame_equal(result2, expected) + class TestAppendColumnsIndex(object): @pytest.mark.parametrize('idx_name3', [None, 'foo', 'bar', 'baz']) From 
cb28274aafc52e02e537881875078f43cdf053a8 Mon Sep 17 00:00:00 2001 From: araraonline Date: Thu, 27 Sep 2018 21:03:39 -0300 Subject: [PATCH 20/30] Fix _normalize_dataframes dtypes problem The resulting index dtype was being inferred (not always?) from the indexes' values. This caused a subtle problem where we inferred in a place the user didn't want to. For example: >>> df1.columns Index([0, 1, 2], dtype='object') >>> df1.append(df1, sort=False).columns Int64Index([0, 1, 2], dtype='int64') This commit solves this problem, but it raises a question for empty indexes, should they be considered when calculating the final dtype or not? My intuition says that the user usually doesn't know about the index type, especially if it's empty, so we may avoid friction ignoring empty indexes in the calculation. The same argument may be used for column dtypes after an append where the resulting DataFrame has exactly one row. --- pandas/core/indexes/api.py | 22 +++++++++++++++++++--- pandas/tests/reshape/test_append.py | 4 +++- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py index 42f4b0c0fb0fa..f70dee43c112d 100644 --- a/pandas/core/indexes/api.py +++ b/pandas/core/indexes/api.py @@ -195,6 +195,25 @@ def _normalize_dataframes(frame_list, verify_inputs=True, sort=False): """ orig_columns = [df.columns for df in frame_list] merged_columns = _merge_index_list(orig_columns, verify_inputs, sort) + + # Because _merge_index_list may infer the index dtype based on values, + # we have to provide a workaround to conserve the original dtype. + # + # Empty indexes come from DataFrames with no columns, and we do not + # consider them when calculating the final index dtype. + # + # XXX: goes against DataFrame.append behavior for empty columns, where we + # let them be object dtype. + # + # What behavior should be adopted? 
+ relevant_cols = [i for i in orig_columns + if not (len(i) == 0 and i.dtype == 'object')] + if relevant_cols: + from pandas.core.dtypes.cast import find_common_type + types = [i.dtype for i in relevant_cols] + common_type = find_common_type(types) + merged_columns = merged_columns.astype(common_type) + return [_reindex(df, merged_columns, axis=1) for df in frame_list] @@ -216,9 +235,6 @@ def _merge_index_list(index_list, verify_inputs=True, sort=False): When sort=True and the result index is not sortable. InvalidIndexError When verify_inputs=True and 1+ of the indexes contain duplicates. - - Examples - -------- """ if verify_inputs: if any([ix.has_duplicates for ix in index_list]): diff --git a/pandas/tests/reshape/test_append.py b/pandas/tests/reshape/test_append.py index 85d47f4e9c68e..d848b06857a2e 100644 --- a/pandas/tests/reshape/test_append.py +++ b/pandas/tests/reshape/test_append.py @@ -19,6 +19,7 @@ pd.Int64Index([3, 4, 5]), pd.UInt64Index([6, 7, 8]), pd.Float64Index([3.5, 4.5, 5.5]), + pd.Index([9, 10, 11], dtype=object), # fake int64 # datetime pd.to_datetime(['2013-01-01', '2013-01-10', '2013-01-15']), @@ -51,7 +52,8 @@ pd.RangeIndex(3), pd.Int64Index([3, 4, 5]), pd.UInt64Index([6, 7, 8]), - pd.Float64Index([3.5, 4.5, 5.5])], + pd.Float64Index([3.5, 4.5, 5.5]), + pd.Index([9, 10, 11], dtype=object)], [pd.to_datetime(['2013-01-01', '2013-01-10', '2013-01-15'])], [pd.to_timedelta(['1 day', '2 days', '3 days'])], From 6cf42b4edd1126cd51e1209d5e71529b77ebd4af Mon Sep 17 00:00:00 2001 From: araraonline Date: Fri, 28 Sep 2018 16:55:28 -0300 Subject: [PATCH 21/30] Add named index to test cases --- pandas/tests/reshape/test_append.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/reshape/test_append.py b/pandas/tests/reshape/test_append.py index d848b06857a2e..c45042c92515e 100644 --- a/pandas/tests/reshape/test_append.py +++ b/pandas/tests/reshape/test_append.py @@ -13,6 +13,7 @@ # base pd.Index(['A', 'B', 'C']), + pd.Index(['A', 'B', 'C'], 
name='foo'), # numeric pd.RangeIndex(3), From adc6a2f9a81594a6b1305ef4032be6f84791ffeb Mon Sep 17 00:00:00 2001 From: araraonline Date: Sat, 29 Sep 2018 20:26:07 -0300 Subject: [PATCH 22/30] Ignore empty columns dtype on append TODO: This shall be reversed later, or be made a bit more strict. My best choice is: ignore when it is empty of dtype object, consider if it is empty of another dtype. May interact somewhat with the result float64 of reindex. --- pandas/core/frame.py | 16 +++---- pandas/tests/reshape/test_append.py | 71 +---------------------------- 2 files changed, 7 insertions(+), 80 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 27b222b8024ce..c715264fc5b36 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6482,17 +6482,13 @@ def _append_list_of_frames(self, other, *args, **kwargs): else: sort = True + # Reindexing the columns created an artificial float64 where it + # was not needed. We can convert the columns back to the expected + # type. if result.shape[0] == 1: - from pandas.core.dtypes.cast import find_common_type - - # Reindexing the columns created an artificial float64 where it - # was not needed. We can convert the columns back to the expected - # type. 
- - for col in result: - types = [df[col].dtype for df in to_concat if col in df] - common_type = find_common_type(types) - result[col] = result[col].astype(common_type) + base_frame = next(df for df in to_concat_norm if df.shape[0] == 1) + dtypes = base_frame.dtypes.to_dict() + result = result.astype(dtypes) # won't work well dups cols return result diff --git a/pandas/tests/reshape/test_append.py b/pandas/tests/reshape/test_append.py index c45042c92515e..33af7231471c0 100644 --- a/pandas/tests/reshape/test_append.py +++ b/pandas/tests/reshape/test_append.py @@ -247,7 +247,7 @@ def test_bad_input_type(self, sort): def test_no_unecessary_upcast(self, sort): # GH: 22621 - # When appending, the resulting columns should + # When appending, the result columns should # not be float64 without necessity. # basic @@ -271,20 +271,16 @@ def test_no_unecessary_upcast(self, sort): assert_frame_equal(result, expected) # 0 rows 2 columns - # (the original dtype (object) of the empty columns - # must be preserved) df1 = pd.DataFrame([[1, 2, 3]], columns=[0, 1, 2]) df2 = pd.DataFrame(columns=[3, 4]) result = df1.append(df2, sort=sort) expected = pd.DataFrame([[1, 2, 3, np.nan, np.nan]]) - expected[[3, 4]] = expected[[3, 4]].astype(object) assert_frame_equal(result, expected) df1 = pd.DataFrame(columns=[0, 1]) df2 = pd.DataFrame([[1, 2, 3]], columns=[2, 3, 4]) result = df1.append(df2, sort=sort) expected = pd.DataFrame([[np.nan, np.nan, 1, 2, 3]]) - expected[[0, 1]] = expected[[0, 1]].astype(object) assert_frame_equal(result, expected) # big.append(small) @@ -301,71 +297,6 @@ def test_no_unecessary_upcast(self, sort): expected = pd.DataFrame([[1, 2, np.nan], [3, 4, 5]]) assert_frame_equal(result, expected) - def test_preserve_empty_columns_dtype(self, sort): - # When appending to an empty DataFrame with columns, the dtype of these - # columns should be accounted for the output. 
- - # append same size (default dtype) - df1 = pd.DataFrame(columns=list('ABC')) # object - df2 = pd.DataFrame([[1, 2, 3]], columns=list('ABC')) - - result1 = df1.append(df2, sort=sort) - result2 = df2.append(df1, sort=sort) - - expected = df2.astype(object) - assert_frame_equal(result1, expected) - assert_frame_equal(result2, expected) - - # GH: 22858 - df1 ends up float64 - # append same size (int64) - # df1 = pd.DataFrame(columns=list('ABC'), dtype='int64') - # df2 = pd.DataFrame([[1, 2, 3]], columns=list('ABC')) - - # result1 = df1.append(df2, sort=sort) - # result2 = df2.append(df1, sort=sort) - - # expected = df2.astype('int64') # same as df2 - # assert_frame_equal(result1, expected) - # assert_frame_equal(result2, expected) - - # append same size (float64) - df1 = pd.DataFrame(columns=list('ABC'), dtype='float64') - df2 = pd.DataFrame([[1, 2, 3]], columns=list('ABC')) - - result1 = df1.append(df2, sort=sort) - result2 = df2.append(df1, sort=sort) - - expected = df2.astype('float64') - assert_frame_equal(result1, expected) - assert_frame_equal(result2, expected) - - # append small/big - small empty - small = pd.DataFrame(columns=list('AB')) - big = pd.DataFrame([[1, 2, 3]], columns=list('ABC')) - - result1 = small.append(big, sort=sort) - result2 = big.append(small, sort=sort) - - expected = big.copy() - expected[['A', 'B']] = expected[['A', 'B']].astype(object) - assert_frame_equal(result1, expected) - assert_frame_equal(result2, expected) - - # append small/big - big empty - small = pd.DataFrame([[1, 2]], columns=list('AB')) - big = pd.DataFrame(columns=list('ABC')) - - result1 = small.append(big, sort=sort) - result2 = big.append(small, sort=sort) - - expected = pd.DataFrame( - [[1, 2, np.nan]], - columns=list('ABC'), - dtype=object - ) - assert_frame_equal(result1, expected) - assert_frame_equal(result2, expected) - class TestAppendColumnsIndex(object): @pytest.mark.parametrize('idx_name3', [None, 'foo', 'bar', 'baz']) From 
87cc87885dffc5923816eba4e87dcfe103bf63d4 Mon Sep 17 00:00:00 2001 From: araraonline Date: Sat, 29 Sep 2018 20:38:57 -0300 Subject: [PATCH 23/30] Remove sort=None handling Will be better made in a future version. --- pandas/core/frame.py | 28 ---------------------------- 1 file changed, 28 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c715264fc5b36..883a43a929bed 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6454,34 +6454,6 @@ def _append_list_of_frames(self, other, *args, **kwargs): if not ignore_index: result.index.name = self.index.name - # the conditionals below will be refactored or removed - - if sort is None: - # The sorting behaviour for None was weird. - # It is getting deprecated. - # - # By now, fix tests by only sorting when the - # original 'other' was a series or a dict. - if _obj_type in (dict, Series): - sort = False - elif _item_type in (dict, Series): - # A list of dicts/Series had a different behaviour - # when sorting is None. - # - # We do not sort if the 'other' columns are all - # contained in self.columns. Otherwise we do - # sort. - # - # TODO: as per documentation, this seems like the original - # behaviour intended for append. Should I implement this - # for any inputs that come? - self_idx = self.columns - other_idx = other[0].columns - idx_diff = other_idx.difference(self_idx) - sort = len(idx_diff) > 0 - else: - sort = True - # Reindexing the columns created an artificial float64 where it # was not needed. We can convert the columns back to the expected # type. From eee2daea0a794ec0a7be13c7109ae427ca6d8df4 Mon Sep 17 00:00:00 2001 From: araraonline Date: Sat, 29 Sep 2018 20:33:36 -0300 Subject: [PATCH 24/30] Modify behavior of append on duplicates When there were duplicates on the columns index, sort was allowed and duplicates were allowed if the indexes had the same values (as found by idx.tolist()). 
Now, considering that pandas doesn't allow sorting the index when there are duplicate values (DataFrame.reindex fails) and that searching for the same values is counter-productive and prone to fail, depending on the different types of indexes, the behavior was modified to this: - When sort=True and there are duplicates in at least one index, an error is raised and append stops. - DataFrames with duplicate indexes are only considered to be joined when the indexes share the same identity (that is, they are the same object comparable with `idx1 is idx2`) Some other improvements to the code have also been made and I believe it is better in general. --- pandas/core/frame.py | 28 ++---- pandas/core/indexes/api.py | 106 +++++++++++++++----- pandas/core/indexes/base.py | 4 - pandas/tests/reshape/test_append.py | 147 +++++++++++++++++++++------- 4 files changed, 204 insertions(+), 81 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 883a43a929bed..0e58a5d1b3591 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6420,31 +6420,15 @@ def _append_list_of_frames(self, other, *args, **kwargs): _obj_type = kwargs['_obj_type'] _item_type = kwargs.get('_item_type') - from pandas.core.indexes.api import ( - CannotSortError, - _normalize_dataframes, - ) + from pandas.core.indexes.api import _normalize_dataframes from pandas.core.reshape.concat import concat - # The default value of sort in version 0.23.0 is None. - # The behavior when this was the value is very - # varied and changes according to input type, columns index - # type, whether a reindex is necessary or not, etc. - # - # The code below is a try to reproduce the old behavior, - # but note that this is deprecated. - # - # TODO: handle sort=None here - - # The behavior of concat is a bit problematic as it is. To get around - # this, we prepare the DataFrames before feeding them into concat. 
+ # TODO: sorting behavior when sort=None + + # The behavior of concat is a bit problematic as it is. To get around, + # we prepare the DataFrames before feeding them into concat. to_concat = [self] + other - try: - to_concat_norm = _normalize_dataframes(to_concat, sort=sort) - except CannotSortError: - raise TypeError("The resulting columns could not be sorted." - " You can try setting sort=False or use" - " compatible index types.") + to_concat_norm = _normalize_dataframes(to_concat, sort=sort) result = concat(to_concat_norm, ignore_index=ignore_index, verify_integrity=verify_integrity, sort=sort) diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py index f70dee43c112d..f4d67ec6649a4 100644 --- a/pandas/core/indexes/api.py +++ b/pandas/core/indexes/api.py @@ -12,8 +12,7 @@ _new_Index, ensure_index, ensure_index_from_sequences, - CannotSortError, - InvalidIndexError + InvalidIndexError, ) from pandas.core.indexes.category import CategoricalIndex # noqa from pandas.core.indexes.multi import MultiIndex # noqa @@ -38,6 +37,18 @@ """) +class _CannotSortError(Exception): + pass + + +class _CannotSortDuplicatesError(Exception): + pass + + +class _DuplicatesError(Exception): + pass + + # TODO: there are many places that rely on these private methods existing in # pandas.core.index __all__ = ['Index', 'MultiIndex', 'NumericIndex', 'Float64Index', 'Int64Index', @@ -181,20 +192,40 @@ def _normalize_dataframes(frame_list, verify_inputs=True, sort=False): ---------- index_list: list of Index objects verify_inputs: boolean, default True - Verify if the input indexes contain overlapping values. + Verify if the input indexes contain duplicate values. Ignored when all + input indexes share the same identity (a is b). sort: boolean, default False - Order result index. If False, values will come in the order they + Order resulting index. If False, values will come in the order they appear. 
Raises ------ - CannotSortError - When sort=True and the result index is not sortable. - InvalidIndexError - When verify_inputs=True and 1+ of the indexes contain duplicates. + InvalidIndexError: + When there are duplicates in at least one of the indexes (col) + and they are not allowed. + TypeError: + When sort=True and the resulting index (col) could not be sorted. """ orig_columns = [df.columns for df in frame_list] - merged_columns = _merge_index_list(orig_columns, verify_inputs, sort) + + kwargs = { + 'verify_dups': verify_inputs, + 'allow_matching_dups': verify_inputs, + 'sort': sort, + } + + try: + merged_columns = _merge_index_list(orig_columns, **kwargs) + except _DuplicatesError: + raise InvalidIndexError("Indexes with duplicates are only allowed" + " when they are the same (a is b).") + except _CannotSortDuplicatesError: + raise InvalidIndexError("When sort=True, indexes with duplicate" + " values are not allowed.") + except _CannotSortError: + raise TypeError("The resulting columns could not be sorted." + " You can try setting sort=False or use" + " compatible index types.") # Because _merge_index_list may infer the index dtype based on values, # we have to provide a workaround to conserve the original dtype. @@ -217,33 +248,64 @@ def _normalize_dataframes(frame_list, verify_inputs=True, sort=False): return [_reindex(df, merged_columns, axis=1) for df in frame_list] -def _merge_index_list(index_list, verify_inputs=True, sort=False): +def _merge_index_list(index_list, + verify_dups=True, + allow_matching_dups=False, + sort=False): """Merge a list of indexes into one big index Parameters ---------- index_list: list of Index objects - verify_inputs: boolean, default True - Verify if the input indexes contain overlapping values. + verify_dups: boolean, default True + Verify if the input indexes contain duplicate values. + allow_matching_dups: boolean, default False + Only relevant when verify_dups=True. 
Allow duplicate values when all + indexes have the same identity. sort: boolean, default False Order result index. If False, values will come in the order they appear. Raises ------ - CannotSortError + _CannotSortError When sort=True and the result index is not sortable. - InvalidIndexError - When verify_inputs=True and 1+ of the indexes contain duplicates. + _CannotSortDuplicatesError + When sort=True and at least one of the inputs contain duplicate + values. + _DuplicatesError + When verify_dups=True and at least one of the input indexes contain + duplicate values. This is error is not raised if + allow_matching_dups=True and all the indexes have a common identity. """ - if verify_inputs: - if any([ix.has_duplicates for ix in index_list]): - raise InvalidIndexError("Input index has duplicate values") - - result = index_list[0] - for idx in index_list[1:]: + # unique index list (a is b) + uindex_list = com.get_distinct_objs(index_list) + + # verify duplicates + if sort or verify_dups: + has_dups = any(ix.has_duplicates for ix in uindex_list) + if has_dups: + if sort: + raise _CannotSortDuplicatesError("Cannot sort an index that" + " contains duplicate values.") + elif verify_dups and not allow_matching_dups: + raise _DuplicatesError("Index has duplicate values.") + elif verify_dups and allow_matching_dups and len(uindex_list) >= 2: + raise _DuplicatesError("Index has duplicate values and does" + " not match other indexes.") + + # edge results + if len(uindex_list) == 0: + return pd.Index() + elif len(uindex_list) == 1: + return uindex_list[0] + + # reduce to one result + result = uindex_list[0] + for idx in uindex_list[1:]: result = _merge_indexes(result, idx) + # sort return result if not sort else _sort_index(result) @@ -278,7 +340,7 @@ def _sort_index(index): try: return index.sort_values() except TypeError: - raise CannotSortError + raise _CannotSortError def _reindex(df, new_index, axis=0): diff --git a/pandas/core/indexes/base.py 
b/pandas/core/indexes/base.py index 40ce8faeba6bb..b2b6e02e908c5 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -158,10 +158,6 @@ def index_arithmetic_method(self, other): return set_function_name(index_arithmetic_method, name, cls) -class CannotSortError(Exception): - pass - - class InvalidIndexError(Exception): pass diff --git a/pandas/tests/reshape/test_append.py b/pandas/tests/reshape/test_append.py index 33af7231471c0..bb9c864d81353 100644 --- a/pandas/tests/reshape/test_append.py +++ b/pandas/tests/reshape/test_append.py @@ -39,6 +39,36 @@ ] +indexes_with_dups = [ + # base + pd.Index(['A', 'B', 'B']), + pd.Index(['B', 'B', 'A']), + pd.Index(['A', 'B', 'B'], name='foo'), + pd.Index(['B', 'B', 'A'], name='bar'), + + # numeric + pd.Index([9, 10, 10], dtype=object), + pd.Int64Index([3, 4, 4]), + pd.UInt64Index([6, 7, 7]), + pd.Float64Index([3.5, 4.5, 4.5]), + + # datetime + pd.to_datetime(['2013-01-01', '2013-01-10', '2013-01-10']), + pd.to_timedelta(['1 day', '2 days', '2 days']), + pd.PeriodIndex([2000, 2001, 2001], freq='A'), + + # interval + pd.IntervalIndex.from_arrays([0, 1, 1], [1, 2, 2]), + + # categorical + pd.CategoricalIndex('A B B'.split()), + pd.CategoricalIndex('D E E'.split(), ordered=True), + + # multi-index + pd.MultiIndex.from_arrays(['A B B'.split(), 'D E E'.split()]), +] + + index_sort_groups = [ # When indexes from the same group are joined, the result is sortable. # When indexes from different groups are joined, the result is not @@ -403,39 +433,90 @@ def test_preserve_index_values_with_sort(self, index1, index2): for value in index2: assert value in result.columns - def test_raise_on_duplicates(self, sort): - # Append should not allow DataFrames with repeated - # column names (or series with repeated row names). 
- - # dupe on base - df1 = pd.DataFrame([[1, 2, 3]], columns=['A', 'B', 'B']) - df2 = pd.DataFrame([[1, 2, 3]], columns=['A', 'B', 'C']) - with pytest.raises(InvalidIndexError): - df1.append([], sort=sort) - with pytest.raises(InvalidIndexError): - df1.append([df2], sort=sort) - with pytest.raises(InvalidIndexError): - df1.append([df2, df2], sort=sort) - - # dupe on other - df1 = pd.DataFrame([[1, 2, 3]], columns=['A', 'B', 'C']) - df2 = pd.DataFrame([[1, 2, 3]], columns=['A', 'B', 'B']) - with pytest.raises(InvalidIndexError): - df1.append([df2], sort=sort) - with pytest.raises(InvalidIndexError): - df1.append([df2, df2], sort=sort) - - # dupe on both - # (we could avoid raising errors here, but, to keep the api - # consistent, we don't) - df1 = pd.DataFrame([[1, 2, 3]], columns=['A', 'B', 'B']) - df2 = pd.DataFrame([[1, 2, 3]], columns=['A', 'B', 'B']) - with pytest.raises(InvalidIndexError): - df1.append([], sort=sort) - with pytest.raises(InvalidIndexError): - df1.append([df2], sort=sort) - with pytest.raises(InvalidIndexError): - df1.append([df2, df2], sort=sort) + @pytest.mark.parametrize('col_index', indexes_with_dups, ids=cls_name) + def test_good_duplicates_without_sort(self, col_index): + # When all indexes have the same identity (a is b), duplicates should + # be allowed and append works. 
+ + df1 = pd.DataFrame([[1, 2, 3]], columns=col_index) + df2 = pd.DataFrame([[4, 5, 6]], columns=col_index) + + # df1.append([]) + result = df1.append([], sort=False) + expected = df1.copy() + assert_frame_equal(result, expected) + + # df1.append([df2]) + result = df1.append([df2], ignore_index=True, sort=False) + expected = pd.DataFrame([[1, 2, 3], [4, 5, 6]]) + expected.columns = col_index + assert_frame_equal(result, expected) + + # df1.append([df2, df2]) + result = df1.append([df2, df2], ignore_index=True, sort=False) + expected = pd.DataFrame([[1, 2, 3], [4, 5, 6], [4, 5, 6]]) + expected.columns = col_index + assert_frame_equal(result, expected) + + # df2.append([]) + result = df2.append([], sort=False) + expected = df2.copy() + assert_frame_equal(result, expected) + + # df2.append([df1]) + result = df2.append([df1], ignore_index=True, sort=False) + expected = pd.DataFrame([[4, 5, 6], [1, 2, 3]]) + expected.columns = col_index + assert_frame_equal(result, expected) + + # df2.append([df1, df1]) + result = df2.append([df1, df1], ignore_index=True, sort=False) + expected = pd.DataFrame([[4, 5, 6], [1, 2, 3], [1, 2, 3]]) + expected.columns = col_index + assert_frame_equal(result, expected) + + @pytest.mark.parametrize('col_index', indexes_with_dups, ids=cls_name) + def test_bad_duplicates_without_sort(self, col_index): + # When the indexes do not share a common identity, duplicates are not + # allowed and append raises. 
+ + df1 = pd.DataFrame([[1, 2, 3]], columns=col_index) + df2 = pd.DataFrame([[4, 5, 6]], columns=col_index) + df3 = pd.DataFrame([[7, 8, 9]], columns=col_index.copy()) # different + ctx = pytest.raises(InvalidIndexError, + match=r'Indexes with duplicates.*a is b.*') + with ctx: + result = df1.append([df3], sort=False) + with ctx: + result = df1.append([df2, df3], sort=False) + with ctx: + result = df1.append([df3, df2], sort=False) + with ctx: + result = df1.append([df3, df3], sort=False) + + @pytest.mark.parametrize('col_index', indexes_with_dups, ids=cls_name) + def test_duplicates_with_sort(self, col_index): + # When sort=True, indexes with duplicate values are not be allowed. + + df1 = pd.DataFrame([[1, 2, 3]], columns=col_index) + df2 = pd.DataFrame([[4, 5, 6]], columns=col_index.copy()) + ctx = pytest.raises(InvalidIndexError, + match=r'When sort=True, indexes with dupl.*') + + with ctx: + result = df1.append([], sort=True) + with ctx: + result = df1.append([df1], sort=True) + with ctx: + result = df1.append([df2], sort=True) + with ctx: + result = df1.append([df1, df1], sort=True) + with ctx: + result = df1.append([df1, df2], sort=True) + with ctx: + result = df1.append([df2, df1], sort=True) + with ctx: + result = df1.append([df2, df2], sort=True) def test_nosort_basic(self): # When sort=False, the resulting columns come From b3733685a475df3220f268784c31deb07c391135 Mon Sep 17 00:00:00 2001 From: araraonline Date: Sun, 30 Sep 2018 13:09:50 -0300 Subject: [PATCH 25/30] clean lines --- pandas/core/indexes/api.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py index f4d67ec6649a4..84eb8762b2ea5 100644 --- a/pandas/core/indexes/api.py +++ b/pandas/core/indexes/api.py @@ -208,14 +208,13 @@ def _normalize_dataframes(frame_list, verify_inputs=True, sort=False): """ orig_columns = [df.columns for df in frame_list] - kwargs = { - 'verify_dups': verify_inputs, - 
'allow_matching_dups': verify_inputs, - 'sort': sort, - } - try: - merged_columns = _merge_index_list(orig_columns, **kwargs) + merged_columns = _merge_index_list( + orig_columns, + verify_dups=verify_inputs, + allow_matching_dups=verify_inputs, # same-id indexes allowed + sort=sort + ) except _DuplicatesError: raise InvalidIndexError("Indexes with duplicates are only allowed" " when they are the same (a is b).") From 4b32c64eac6cbc1eaca9936e78041da882fb3e75 Mon Sep 17 00:00:00 2001 From: araraonline Date: Sun, 30 Sep 2018 13:12:21 -0300 Subject: [PATCH 26/30] Fix indexes.api._reindex --- pandas/core/indexes/api.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py index 84eb8762b2ea5..f9cf339a6affe 100644 --- a/pandas/core/indexes/api.py +++ b/pandas/core/indexes/api.py @@ -359,7 +359,7 @@ def _reindex(df, new_index, axis=0): MultiIndex errors. """ try: - return df.reindex(columns=new_index, copy=False) + return df.reindex(new_index, axis=axis, copy=False) except TypeError: if isinstance(df.columns, ABCIntervalIndex): df.columns = df.columns.astype(object) @@ -367,4 +367,4 @@ def _reindex(df, new_index, axis=0): df.columns = df.columns.values else: raise - return df.reindex(columns=new_index, copy=False) + return df.reindex(new_index, axis=axis, copy=False) From 2617449d00231ed5d2bee7b739685cb007a98014 Mon Sep 17 00:00:00 2001 From: araraonline Date: Sun, 30 Sep 2018 13:13:07 -0300 Subject: [PATCH 27/30] clean test --- pandas/tests/reshape/test_append.py | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/pandas/tests/reshape/test_append.py b/pandas/tests/reshape/test_append.py index bb9c864d81353..ef9e0d16d259f 100644 --- a/pandas/tests/reshape/test_append.py +++ b/pandas/tests/reshape/test_append.py @@ -458,23 +458,6 @@ def test_good_duplicates_without_sort(self, col_index): expected.columns = col_index assert_frame_equal(result, expected) - # df2.append([]) - result = 
df2.append([], sort=False) - expected = df2.copy() - assert_frame_equal(result, expected) - - # df2.append([df1]) - result = df2.append([df1], ignore_index=True, sort=False) - expected = pd.DataFrame([[4, 5, 6], [1, 2, 3]]) - expected.columns = col_index - assert_frame_equal(result, expected) - - # df2.append([df1, df1]) - result = df2.append([df1, df1], ignore_index=True, sort=False) - expected = pd.DataFrame([[4, 5, 6], [1, 2, 3], [1, 2, 3]]) - expected.columns = col_index - assert_frame_equal(result, expected) - @pytest.mark.parametrize('col_index', indexes_with_dups, ids=cls_name) def test_bad_duplicates_without_sort(self, col_index): # When the indexes do not share a common identity, duplicates are not From c1e8e0f61dea56c6a5f753260a564f7cd268c694 Mon Sep 17 00:00:00 2001 From: araraonline Date: Mon, 1 Oct 2018 00:33:40 -0300 Subject: [PATCH 28/30] Fix small error: 'pd' is not defined Also trying to use Index without arguments --- pandas/core/indexes/api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py index f9cf339a6affe..a0612da7f0af2 100644 --- a/pandas/core/indexes/api.py +++ b/pandas/core/indexes/api.py @@ -295,7 +295,7 @@ def _merge_index_list(index_list, # edge results if len(uindex_list) == 0: - return pd.Index() + return Index([]) elif len(uindex_list) == 1: return uindex_list[0] From a8f03b70339a918503788cf42e53ab45f7bfd43c Mon Sep 17 00:00:00 2001 From: araraonline Date: Mon, 1 Oct 2018 00:41:07 -0300 Subject: [PATCH 29/30] Ignore empty indexes in _merge_index_list This reflects in the columns index of DataFrame.append, it will ignore empty indexes (of dtype object)! Some tests are not passing, but this is due to columns dtypes, not indexes. 
--- pandas/core/indexes/api.py | 5 +++ pandas/core/indexes/base.py | 3 ++ pandas/tests/reshape/test_append.py | 58 +++++++++++++++++++++++++++++ 3 files changed, 66 insertions(+) diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py index a0612da7f0af2..eb1748a10197d 100644 --- a/pandas/core/indexes/api.py +++ b/pandas/core/indexes/api.py @@ -276,9 +276,14 @@ def _merge_index_list(index_list, When verify_dups=True and at least one of the input indexes contain duplicate values. This is error is not raised if allow_matching_dups=True and all the indexes have a common identity. + + Notes + ----- + Empty indexes (of object dtype) are forgotten. """ # unique index list (a is b) uindex_list = com.get_distinct_objs(index_list) + uindex_list = [i for i in uindex_list if not i.is_empty()] # verify duplicates if sort or verify_dups: diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index b2b6e02e908c5..49a2ede8beb3a 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -1541,6 +1541,9 @@ def is_unique(self): def has_duplicates(self): return not self.is_unique + def is_empty(self): + return self.inferred_type in ['empty'] + def is_boolean(self): return self.inferred_type in ['boolean'] diff --git a/pandas/tests/reshape/test_append.py b/pandas/tests/reshape/test_append.py index ef9e0d16d259f..17e02fdffcc72 100644 --- a/pandas/tests/reshape/test_append.py +++ b/pandas/tests/reshape/test_append.py @@ -388,6 +388,35 @@ def test_preserve_index_type(self, sort, index): expected = pd.DataFrame([[1, 2, np.nan], [3, 4, 5]], columns=index) assert_frame_equal(result, expected) + def test_ignore_empty_index_dtype(self, sort): + # When one of the indexes is empty and of object dtype, it should be + # ignored in the result (as empty). 
+ + df1 = pd.DataFrame() + df2 = pd.DataFrame([[11, 12, 13]], columns=[1, 2, 3]) + + result1 = df1.append(df2, sort=sort) + result2 = df2.append(df1, sort=sort) + + expected = df2.copy() + assert_frame_equal(result1, expected) + assert_frame_equal(result2, expected) + + def test_account_empty_index_dtype(self, sort): + # When one of the indexes is empty and of dtype different from object, + # it should not be ignored when calculating the result dtype. + + df1 = pd.DataFrame(columns=pd.Float64Index([])) + df2 = pd.DataFrame([[11, 12, 13]], columns=[1, 2, 3]) + + result1 = df1.append(df2, sort=sort) + result2 = df2.append(df1, sort=sort) + + expected = df2.copy() + expected.columns = [1.0, 2.0, 3.0] + assert_frame_equal(result1, expected) + assert_frame_equal(result2, expected) + @pytest.mark.parametrize('index2', indexes, ids=cls_name) @pytest.mark.parametrize('index1', indexes, ids=cls_name) def test_preserve_index_values_without_sort(self, index1, index2): @@ -751,6 +780,35 @@ def test_preserve_index_name(self, sort, idx_name1, idx_name2, idx_name3): expected.index.name = idx_name1 assert_frame_equal(result, expected) + def test_ignore_empty_index_dtype(self, sort): + # When one of the indexes is empty and of object dtype, it should be + # ignored in the result (as empty). + + df1 = pd.DataFrame() + df2 = pd.DataFrame([[11], [12], [13]], index=[1, 2, 3]) + + result1 = df1.append(df2, sort=sort) + result2 = df2.append(df1, sort=sort) + + expected = df2.copy() + assert_frame_equal(result1, expected) + assert_frame_equal(result2, expected) + + def test_account_empty_index_dtype(self, sort): + # When one of the indexes is empty and of dtype different from object, + # it should not be ignored when calculating the result dtype. 
+ + df1 = pd.DataFrame(index=pd.Float64Index([])) + df2 = pd.DataFrame([[11], [12], [13]], index=[1, 2, 3]) + + result1 = df1.append(df2, sort=sort) + result2 = df2.append(df1, sort=sort) + + expected = df2.copy() + expected.index = [1.0, 2.0, 3.0] + assert_frame_equal(result1, expected) + assert_frame_equal(result2, expected) + @pytest.mark.parametrize('index', indexes, ids=cls_name) def test_preserve_index_type(self, sort, index): # when there's only one index type in the inputs, From 4a5494205c6e6cab06053f175d4bf840f8c73424 Mon Sep 17 00:00:00 2001 From: araraonline Date: Mon, 1 Oct 2018 18:40:18 -0300 Subject: [PATCH 30/30] Implement sort=None behavior (#8) Implement sort=None behavior and regression tests The default value for sort (None) had a complex behavior and this commit aimed to reproduce it. When the default value changes, it will be wise to remove what was added in this commit, along with some preparation code that was already present in `append` (calculating `_obj_type` and `_item_type`).
--- pandas/core/frame.py | 39 +++++++- pandas/tests/reshape/test_append.py | 143 ++++++++++++++++++++++++++++ 2 files changed, 181 insertions(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 0e58a5d1b3591..4db4239ea1b54 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6423,7 +6423,44 @@ def _append_list_of_frames(self, other, *args, **kwargs): from pandas.core.indexes.api import _normalize_dataframes from pandas.core.reshape.concat import concat - # TODO: sorting behavior when sort=None + # sorting behavior when sort=None + # TODO: remove when the kwarg's default value changes + if sort is None: + # establish desired behavior + if _obj_type in (dict, Series): + # dict/ser + + sort = False + warn = False + elif _item_type in (dict, Series): + # [dict]/[ser] + + if (self.columns.get_indexer(other[0].columns) >= 0).all(): + # self.columns >= other[0].columns + sort = False + warn = False + else: + sort = True + types = [df.columns.dtype for df in [self] + other] + common = find_common_type(types) + warn = (common == object) + else: + # frame/[frame] + + if all(self.columns.equals(df.columns) for df in other): + # all values the same + sort = False + warn = False + else: + sort = True + types = [df.columns.dtype for df in [self] + other] + common = find_common_type(types) + warn = (common == object) + + # warn if necessary + if warn: + from pandas.core.indexes.api import _sort_msg + warnings.warn(_sort_msg, FutureWarning) # The behavior of concat is a bit problematic as it is. To get around, # we prepare the DataFrames before feeding them into concat.
diff --git a/pandas/tests/reshape/test_append.py b/pandas/tests/reshape/test_append.py index 17e02fdffcc72..d2e07ec8d92ff 100644 --- a/pandas/tests/reshape/test_append.py +++ b/pandas/tests/reshape/test_append.py @@ -4,6 +4,7 @@ import pytest import pandas as pd +from pandas import DataFrame, Index, Series from pandas.core.indexes.base import InvalidIndexError from pandas.util.testing import assert_frame_equal @@ -328,6 +329,148 @@ def test_no_unecessary_upcast(self, sort): assert_frame_equal(result, expected) +class TestAppendSortNone(object): + """Regression tests to preserve the behavior of sort=None + """ + + def generate_frames(self, compare, special): + if compare == 'lt': + if special: + df1 = DataFrame([[11, 12]], columns=[2, 1]) + df2 = DataFrame([[13, 14, 15]], columns=[3, 2, 1]) + else: + df1 = DataFrame([[11, 12]], columns=list('ba')) + df2 = DataFrame([[13, 14, 15]], columns=list('cba')) + elif compare == 'eq': + if special: + df1 = DataFrame([[11, 12, 13]], columns=[3, 2, 1]) + df2 = DataFrame([[14, 15, 16]], columns=[3, 2, 1]) + else: + df1 = DataFrame([[11, 12, 13]], columns=list('cba')) + df2 = DataFrame([[14, 15, 16]], columns=list('cba')) + elif compare == 'gt': + if special: + df1 = DataFrame([[11, 12, 13]], columns=[3, 2, 1]) + df2 = DataFrame([[14, 15]], columns=[2, 1]) + else: + df1 = DataFrame([[11, 12, 13]], columns=list('cba')) + df2 = DataFrame([[14, 15]], columns=list('ba')) + elif compare == 'dups': + # special category for duplicates + # assumes compare = 'eq' + if special: + df1 = DataFrame([[11, 12, 13]], columns=[3, 3, 1]) + df2 = DataFrame([[14, 15, 16]], columns=[3, 3, 1]) + else: + df1 = DataFrame([[11, 12, 13]], columns=list('cca')) + df2 = DataFrame([[14, 15, 16]], columns=list('cca')) + + # avoid upcasting problems + df1 = df1.astype('float64') + df2 = df2.astype('float64') + + return df1, df2 + + def merge_indexes(self, idx1, idx2, sort): + len1 = idx1.size + len2 = idx2.size + + if len1 < len2: + # match 'lt' in 
self.generate_frames + vals1 = idx1.tolist() + vals2 = [idx2.tolist()[0]] + result = Index(vals1 + vals2) + else: + result = idx1.copy() + + return result.sort_values() if sort else result + + def merge_frames(self, df1, df2, sort): + new_index = self.merge_indexes(df1.columns, df2.columns, sort) + df1 = df1.reindex(new_index, axis=1) + df2 = df2.reindex(new_index, axis=1) + + values = np.vstack([df1.values[0, :], df2.values[0, :]]) + result = DataFrame(values, columns=new_index) + return result + + @pytest.mark.parametrize('input_type', ['series', 'dict']) + @pytest.mark.parametrize('special', [True, False]) + @pytest.mark.parametrize('compare', ['lt', 'eq', 'gt', 'dups']) + def test_append_series_dict(self, compare, special, input_type): + # When appending a Series or dict, the resulting columns come unsorted + # and no warning is raised. + + sorts = False + warns = False + + df1, df2 = self.generate_frames(compare, special) + if input_type == 'series': + other = df2.loc[0] + else: + other = df2.loc[0].to_dict() + if compare == 'dups': + return + + ctx = pytest.warns(FutureWarning) if warns else pytest.warns(None) + expected = self.merge_frames(df1, df2, sorts) + with ctx: + result = df1.append(other, ignore_index=True, sort=None) + assert_frame_equal(result, expected) + + @pytest.mark.parametrize('input_type', ['[series]', '[dict]']) + @pytest.mark.parametrize('special', [True, False]) + @pytest.mark.parametrize('compare', ['lt', 'eq', 'gt']) # dups won't work + def test_append_list_of_series_dict(self, compare, special, input_type): + # When appending a list of Series or list of dicts, the behavior is + # as specified below. 
+ + if compare in ('gt', 'eq'): + sorts = False + warns = False + else: + sorts = True + warns = not special + + df1, df2 = self.generate_frames(compare, special) + if input_type == '[series]': + other = [df2.loc[0]] + else: + other = [df2.loc[0].to_dict()] + + ctx = pytest.warns(FutureWarning) if warns else pytest.warns(None) + expected = self.merge_frames(df1, df2, sorts) + with ctx: + result = df1.append(other, ignore_index=True, sort=None) + assert_frame_equal(result, expected) + + @pytest.mark.parametrize('input_type', ['dataframe', '[dataframe]']) + @pytest.mark.parametrize('special', [True, False]) + @pytest.mark.parametrize('compare', ['lt', 'eq', 'gt', 'dups']) + def test_append_dframe_list_of_dframe(self, compare, special, input_type): + # When appending a DataFrame or list of DataFrames, the behavior is as + # specified below. + + if compare in ('dups', 'eq'): + sorts = False + warns = False + else: + sorts = True + warns = not special + + df1, df2 = self.generate_frames(compare, special) + if input_type == 'dataframe': + other = df2 + else: + other = [df2] + + ctx = pytest.warns(FutureWarning) if warns else pytest.warns(None) + expected = self.merge_frames(df1, df2, sorts) + with ctx: + result = df1.append(other, ignore_index=True, sort=None) + assert_frame_equal(result, expected) + + class TestAppendColumnsIndex(object): @pytest.mark.parametrize('idx_name3', [None, 'foo', 'bar', 'baz']) @pytest.mark.parametrize('idx_name2', [None, 'foo', 'bar', 'baz'])