diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index c0e9b89c1877f..a399fa2b68680 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -1542,384 +1542,6 @@ def test_isin_empty_datetimelike(self): result = df1_td.isin(df3) tm.assert_frame_equal(result, expected) - # ---------------------------------------------------------------------- - # Row deduplication - - def test_drop_duplicates(self): - df = DataFrame({'AAA': ['foo', 'bar', 'foo', 'bar', - 'foo', 'bar', 'bar', 'foo'], - 'B': ['one', 'one', 'two', 'two', - 'two', 'two', 'one', 'two'], - 'C': [1, 1, 2, 2, 2, 2, 1, 2], - 'D': lrange(8)}) - - # single column - result = df.drop_duplicates('AAA') - expected = df[:2] - tm.assert_frame_equal(result, expected) - - result = df.drop_duplicates('AAA', keep='last') - expected = df.loc[[6, 7]] - tm.assert_frame_equal(result, expected) - - result = df.drop_duplicates('AAA', keep=False) - expected = df.loc[[]] - tm.assert_frame_equal(result, expected) - assert len(result) == 0 - - # multi column - expected = df.loc[[0, 1, 2, 3]] - result = df.drop_duplicates(np.array(['AAA', 'B'])) - tm.assert_frame_equal(result, expected) - result = df.drop_duplicates(['AAA', 'B']) - tm.assert_frame_equal(result, expected) - - result = df.drop_duplicates(('AAA', 'B'), keep='last') - expected = df.loc[[0, 5, 6, 7]] - tm.assert_frame_equal(result, expected) - - result = df.drop_duplicates(('AAA', 'B'), keep=False) - expected = df.loc[[0]] - tm.assert_frame_equal(result, expected) - - # consider everything - df2 = df.loc[:, ['AAA', 'B', 'C']] - - result = df2.drop_duplicates() - # in this case only - expected = df2.drop_duplicates(['AAA', 'B']) - tm.assert_frame_equal(result, expected) - - result = df2.drop_duplicates(keep='last') - expected = df2.drop_duplicates(['AAA', 'B'], keep='last') - tm.assert_frame_equal(result, expected) - - result = df2.drop_duplicates(keep=False) - expected = df2.drop_duplicates(['AAA', 'B'], keep=False) - tm.assert_frame_equal(result, expected) - - # integers - result = df.drop_duplicates('C') - expected = df.iloc[[0, 2]] - tm.assert_frame_equal(result, expected) - result = df.drop_duplicates('C', keep='last') - expected = df.iloc[[-2, -1]] - tm.assert_frame_equal(result, expected) - - df['E'] = df['C'].astype('int8') - result = df.drop_duplicates('E') - expected = df.iloc[[0, 2]] - tm.assert_frame_equal(result, expected) - result = df.drop_duplicates('E', keep='last') - expected = df.iloc[[-2, -1]] - tm.assert_frame_equal(result, expected) - - # GH 11376 - df = pd.DataFrame({'x': [7, 6, 3, 3, 4, 8, 0], - 'y': [0, 6, 5, 5, 9, 1, 2]}) - expected = df.loc[df.index != 3] - tm.assert_frame_equal(df.drop_duplicates(), expected) - - df = pd.DataFrame([[1, 0], [0, 2]]) - tm.assert_frame_equal(df.drop_duplicates(), df) - - df = pd.DataFrame([[-2, 0], [0, -4]]) - tm.assert_frame_equal(df.drop_duplicates(), df) - - x = np.iinfo(np.int64).max / 3 * 2 - df = pd.DataFrame([[-x, x], [0, x + 4]]) - tm.assert_frame_equal(df.drop_duplicates(), df) - - df = pd.DataFrame([[-x, x], [x, x + 4]]) - tm.assert_frame_equal(df.drop_duplicates(), df) - - # GH 11864 - df = pd.DataFrame([i] * 9 for i in range(16)) - df = df.append([[1] + [0] * 8], ignore_index=True) - - for keep in ['first', 'last', False]: - assert df.duplicated(keep=keep).sum() == 0 - - @pytest.mark.parametrize('subset', ['a', ['a'], ['a', 'B']]) - def test_duplicated_with_misspelled_column_name(self, subset): - # GH 19730 - df = pd.DataFrame({'A': [0, 0, 1], - 'B': [0, 0, 1], - 'C': [0, 0, 1]}) - - with pytest.raises(KeyError): - df.duplicated(subset) - - with pytest.raises(KeyError): - df.drop_duplicates(subset) - - @pytest.mark.slow - def test_duplicated_do_not_fail_on_wide_dataframes(self): - # gh-21524 - # Given the wide dataframe with a lot of columns - # with different (important!) values - data = {'col_{0:02d}'.format(i): np.random.randint(0, 1000, 30000) - for i in range(100)} - df = pd.DataFrame(data).T - result = df.duplicated() - - # Then duplicates produce the bool pd.Series as a result - # and don't fail during calculation. - # Actual values doesn't matter here, though usually - # it's all False in this case - assert isinstance(result, pd.Series) - assert result.dtype == np.bool - - def test_drop_duplicates_with_duplicate_column_names(self): - # GH17836 - df = DataFrame([ - [1, 2, 5], - [3, 4, 6], - [3, 4, 7] - ], columns=['a', 'a', 'b']) - - result0 = df.drop_duplicates() - tm.assert_frame_equal(result0, df) - - result1 = df.drop_duplicates('a') - expected1 = df[:2] - tm.assert_frame_equal(result1, expected1) - - def test_drop_duplicates_for_take_all(self): - df = DataFrame({'AAA': ['foo', 'bar', 'baz', 'bar', - 'foo', 'bar', 'qux', 'foo'], - 'B': ['one', 'one', 'two', 'two', - 'two', 'two', 'one', 'two'], - 'C': [1, 1, 2, 2, 2, 2, 1, 2], - 'D': lrange(8)}) - - # single column - result = df.drop_duplicates('AAA') - expected = df.iloc[[0, 1, 2, 6]] - tm.assert_frame_equal(result, expected) - - result = df.drop_duplicates('AAA', keep='last') - expected = df.iloc[[2, 5, 6, 7]] - tm.assert_frame_equal(result, expected) - - result = df.drop_duplicates('AAA', keep=False) - expected = df.iloc[[2, 6]] - tm.assert_frame_equal(result, expected) - - # multiple columns - result = df.drop_duplicates(['AAA', 'B']) - expected = df.iloc[[0, 1, 2, 3, 4, 6]] - tm.assert_frame_equal(result, expected) - - result = df.drop_duplicates(['AAA', 'B'], keep='last') - expected = df.iloc[[0, 1, 2, 5, 6, 7]] - tm.assert_frame_equal(result, expected) - - result = df.drop_duplicates(['AAA', 'B'], keep=False) - expected = df.iloc[[0, 1, 2, 6]] - tm.assert_frame_equal(result, expected) - - def test_drop_duplicates_tuple(self): - df = DataFrame({('AA', 'AB'): ['foo', 'bar', 'foo', 'bar', - 'foo', 'bar', 'bar', 'foo'], - 'B': ['one', 'one', 'two', 'two', - 'two', 'two', 'one', 'two'], - 'C': [1, 1, 2, 2, 2, 2, 1, 2], - 'D': lrange(8)}) - - # single column - result = df.drop_duplicates(('AA', 'AB')) - expected = df[:2] - tm.assert_frame_equal(result, expected) - - result = df.drop_duplicates(('AA', 'AB'), keep='last') - expected = df.loc[[6, 7]] - tm.assert_frame_equal(result, expected) - - result = df.drop_duplicates(('AA', 'AB'), keep=False) - expected = df.loc[[]] # empty df - assert len(result) == 0 - tm.assert_frame_equal(result, expected) - - # multi column - expected = df.loc[[0, 1, 2, 3]] - result = df.drop_duplicates((('AA', 'AB'), 'B')) - tm.assert_frame_equal(result, expected) - - def test_drop_duplicates_NA(self): - # none - df = DataFrame({'A': [None, None, 'foo', 'bar', - 'foo', 'bar', 'bar', 'foo'], - 'B': ['one', 'one', 'two', 'two', - 'two', 'two', 'one', 'two'], - 'C': [1.0, np.nan, np.nan, np.nan, 1., 1., 1, 1.], - 'D': lrange(8)}) - - # single column - result = df.drop_duplicates('A') - expected = df.loc[[0, 2, 3]] - tm.assert_frame_equal(result, expected) - - result = df.drop_duplicates('A', keep='last') - expected = df.loc[[1, 6, 7]] - tm.assert_frame_equal(result, expected) - - result = df.drop_duplicates('A', keep=False) - expected = df.loc[[]] # empty df - tm.assert_frame_equal(result, expected) - assert len(result) == 0 - - # multi column - result = df.drop_duplicates(['A', 'B']) - expected = df.loc[[0, 2, 3, 6]] - tm.assert_frame_equal(result, expected) - - result = df.drop_duplicates(['A', 'B'], keep='last') - expected = df.loc[[1, 5, 6, 7]] - tm.assert_frame_equal(result, expected) - - result = df.drop_duplicates(['A', 'B'], keep=False) - expected = df.loc[[6]] - tm.assert_frame_equal(result, expected) - - # nan - df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', - 'foo', 'bar', 'bar', 'foo'], - 'B': ['one', 'one', 'two', 'two', - 'two', 'two', 'one', 'two'], - 'C': [1.0, np.nan, np.nan, np.nan, 1., 1., 1, 1.], - 'D': lrange(8)}) - - # single column - result = df.drop_duplicates('C') - expected = df[:2] - tm.assert_frame_equal(result, expected) - - result = df.drop_duplicates('C', keep='last') - expected = df.loc[[3, 7]] - tm.assert_frame_equal(result, expected) - - result = df.drop_duplicates('C', keep=False) - expected = df.loc[[]] # empty df - tm.assert_frame_equal(result, expected) - assert len(result) == 0 - - # multi column - result = df.drop_duplicates(['C', 'B']) - expected = df.loc[[0, 1, 2, 4]] - tm.assert_frame_equal(result, expected) - - result = df.drop_duplicates(['C', 'B'], keep='last') - expected = df.loc[[1, 3, 6, 7]] - tm.assert_frame_equal(result, expected) - - result = df.drop_duplicates(['C', 'B'], keep=False) - expected = df.loc[[1]] - tm.assert_frame_equal(result, expected) - - def test_drop_duplicates_NA_for_take_all(self): - # none - df = DataFrame({'A': [None, None, 'foo', 'bar', - 'foo', 'baz', 'bar', 'qux'], - 'C': [1.0, np.nan, np.nan, np.nan, 1., 2., 3, 1.]}) - - # single column - result = df.drop_duplicates('A') - expected = df.iloc[[0, 2, 3, 5, 7]] - tm.assert_frame_equal(result, expected) - - result = df.drop_duplicates('A', keep='last') - expected = df.iloc[[1, 4, 5, 6, 7]] - tm.assert_frame_equal(result, expected) - - result = df.drop_duplicates('A', keep=False) - expected = df.iloc[[5, 7]] - tm.assert_frame_equal(result, expected) - - # nan - - # single column - result = df.drop_duplicates('C') - expected = df.iloc[[0, 1, 5, 6]] - tm.assert_frame_equal(result, expected) - - result = df.drop_duplicates('C', keep='last') - expected = df.iloc[[3, 5, 6, 7]] - tm.assert_frame_equal(result, expected) - - result = df.drop_duplicates('C', keep=False) - expected = df.iloc[[5, 6]] - tm.assert_frame_equal(result, expected) - - def test_drop_duplicates_inplace(self): - orig = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', - 'foo', 'bar', 'bar', 'foo'], - 'B': ['one', 'one', 'two', 'two', - 'two', 'two', 'one', 'two'], - 'C': [1, 1, 2, 2, 2, 2, 1, 2], - 'D': lrange(8)}) - - # single column - df = orig.copy() - df.drop_duplicates('A', inplace=True) - expected = orig[:2] - result = df - tm.assert_frame_equal(result, expected) - - df = orig.copy() - df.drop_duplicates('A', keep='last', inplace=True) - expected = orig.loc[[6, 7]] - result = df - tm.assert_frame_equal(result, expected) - - df = orig.copy() - df.drop_duplicates('A', keep=False, inplace=True) - expected = orig.loc[[]] - result = df - tm.assert_frame_equal(result, expected) - assert len(df) == 0 - - # multi column - df = orig.copy() - df.drop_duplicates(['A', 'B'], inplace=True) - expected = orig.loc[[0, 1, 2, 3]] - result = df - tm.assert_frame_equal(result, expected) - - df = orig.copy() - df.drop_duplicates(['A', 'B'], keep='last', inplace=True) - expected = orig.loc[[0, 5, 6, 7]] - result = df - tm.assert_frame_equal(result, expected) - - df = orig.copy() - df.drop_duplicates(['A', 'B'], keep=False, inplace=True) - expected = orig.loc[[0]] - result = df - tm.assert_frame_equal(result, expected) - - # consider everything - orig2 = orig.loc[:, ['A', 'B', 'C']].copy() - - df2 = orig2.copy() - df2.drop_duplicates(inplace=True) - # in this case only - expected = orig2.drop_duplicates(['A', 'B']) - result = df2 - tm.assert_frame_equal(result, expected) - - df2 = orig2.copy() - df2.drop_duplicates(keep='last', inplace=True) - expected = orig2.drop_duplicates(['A', 'B'], keep='last') - result = df2 - tm.assert_frame_equal(result, expected) - - df2 = orig2.copy() - df2.drop_duplicates(keep=False, inplace=True) - expected = orig2.drop_duplicates(['A', 'B'], keep=False) - result = df2 - tm.assert_frame_equal(result, expected) - # Rounding def test_round(self): # GH 2665 diff --git a/pandas/tests/frame/test_duplicates.py b/pandas/tests/frame/test_duplicates.py new file mode 100644 index 0000000000000..289170527dea7 --- /dev/null +++ b/pandas/tests/frame/test_duplicates.py @@ -0,0 +1,439 @@ +# -*- coding: utf-8 -*- + +from __future__ import print_function + +import pytest + +import numpy as np + +from pandas.compat import lrange, string_types +from pandas import DataFrame, Series + +import pandas.util.testing as tm + + +@pytest.mark.parametrize('subset', ['a', ['a'], ['a', 'B']]) +def test_duplicated_with_misspelled_column_name(subset): + # GH 19730 + df = DataFrame({'A': [0, 0, 1], + 'B': [0, 0, 1], + 'C': [0, 0, 1]}) + + with pytest.raises(KeyError): + df.duplicated(subset) + + with pytest.raises(KeyError): + df.drop_duplicates(subset) + + +@pytest.mark.slow +def test_duplicated_do_not_fail_on_wide_dataframes(): + # gh-21524 + # Given the wide dataframe with a lot of columns + # with different (important!) values + data = {'col_{0:02d}'.format(i): np.random.randint(0, 1000, 30000) + for i in range(100)} + df = DataFrame(data).T + result = df.duplicated() + + # Then duplicates produce the bool Series as a result and don't fail during + # calculation. Actual values doesn't matter here, though usually it's all + # False in this case + assert isinstance(result, Series) + assert result.dtype == np.bool + + +@pytest.mark.parametrize('keep, expected', [ + ('first', Series([False, False, True, False, True])), + ('last', Series([True, True, False, False, False])), + (False, Series([True, True, True, False, True])) +]) +def test_duplicated_keep(keep, expected): + df = DataFrame({'A': [0, 1, 1, 2, 0], 'B': ['a', 'b', 'b', 'c', 'a']}) + + result = df.duplicated(keep=keep) + tm.assert_series_equal(result, expected) + + +@pytest.mark.xfail(reason="GH21720; nan/None falsely considered equal") +@pytest.mark.parametrize('keep, expected', [ + ('first', Series([False, False, True, False, True])), + ('last', Series([True, True, False, False, False])), + (False, Series([True, True, True, False, True])) +]) +def test_duplicated_nan_none(keep, expected): + df = DataFrame({'C': [np.nan, 3, 3, None, np.nan]}, dtype=object) + + result = df.duplicated(keep=keep) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize('keep', ['first', 'last', False]) +@pytest.mark.parametrize('subset', [None, ['A', 'B'], 'A']) +def test_duplicated_subset(subset, keep): + df = DataFrame({'A': [0, 1, 1, 2, 0], + 'B': ['a', 'b', 'b', 'c', 'a'], + 'C': [np.nan, 3, 3, None, np.nan]}) + + if subset is None: + subset = list(df.columns) + elif isinstance(subset, string_types): + # need to have a DataFrame, not a Series + # -> select columns with singleton list, not string + subset = [subset] + + expected = df[subset].duplicated(keep=keep) + result = df.duplicated(keep=keep, subset=subset) + tm.assert_series_equal(result, expected) + + +def test_drop_duplicates(): + df = DataFrame({'AAA': ['foo', 'bar', 'foo', 'bar', + 'foo', 'bar', 'bar', 'foo'], + 'B': ['one', 'one', 'two', 'two', + 'two', 'two', 'one', 'two'], + 'C': [1, 1, 2, 2, 2, 2, 1, 2], + 'D': lrange(8)}) + + # single column + result = df.drop_duplicates('AAA') + expected = df[:2] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates('AAA', keep='last') + expected = df.loc[[6, 7]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates('AAA', keep=False) + expected = df.loc[[]] + tm.assert_frame_equal(result, expected) + assert len(result) == 0 + + # multi column + expected = df.loc[[0, 1, 2, 3]] + result = df.drop_duplicates(np.array(['AAA', 'B'])) + tm.assert_frame_equal(result, expected) + result = df.drop_duplicates(['AAA', 'B']) + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates(('AAA', 'B'), keep='last') + expected = df.loc[[0, 5, 6, 7]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates(('AAA', 'B'), keep=False) + expected = df.loc[[0]] + tm.assert_frame_equal(result, expected) + + # consider everything + df2 = df.loc[:, ['AAA', 'B', 'C']] + + result = df2.drop_duplicates() + # in this case only + expected = df2.drop_duplicates(['AAA', 'B']) + tm.assert_frame_equal(result, expected) + + result = df2.drop_duplicates(keep='last') + expected = df2.drop_duplicates(['AAA', 'B'], keep='last') + tm.assert_frame_equal(result, expected) + + result = df2.drop_duplicates(keep=False) + expected = df2.drop_duplicates(['AAA', 'B'], keep=False) + tm.assert_frame_equal(result, expected) + + # integers + result = df.drop_duplicates('C') + expected = df.iloc[[0, 2]] + tm.assert_frame_equal(result, expected) + result = df.drop_duplicates('C', keep='last') + expected = df.iloc[[-2, -1]] + tm.assert_frame_equal(result, expected) + + df['E'] = df['C'].astype('int8') + result = df.drop_duplicates('E') + expected = df.iloc[[0, 2]] + tm.assert_frame_equal(result, expected) + result = df.drop_duplicates('E', keep='last') + expected = df.iloc[[-2, -1]] + tm.assert_frame_equal(result, expected) + + # GH 11376 + df = DataFrame({'x': [7, 6, 3, 3, 4, 8, 0], + 'y': [0, 6, 5, 5, 9, 1, 2]}) + expected = df.loc[df.index != 3] + tm.assert_frame_equal(df.drop_duplicates(), expected) + + df = DataFrame([[1, 0], [0, 2]]) + tm.assert_frame_equal(df.drop_duplicates(), df) + + df = DataFrame([[-2, 0], [0, -4]]) + tm.assert_frame_equal(df.drop_duplicates(), df) + + x = np.iinfo(np.int64).max / 3 * 2 + df = DataFrame([[-x, x], [0, x + 4]]) + tm.assert_frame_equal(df.drop_duplicates(), df) + + df = DataFrame([[-x, x], [x, x + 4]]) + tm.assert_frame_equal(df.drop_duplicates(), df) + + # GH 11864 + df = DataFrame([i] * 9 for i in range(16)) + df = df.append([[1] + [0] * 8], ignore_index=True) + + for keep in ['first', 'last', False]: + assert df.duplicated(keep=keep).sum() == 0 + + +def test_drop_duplicates_with_duplicate_column_names(): + # GH17836 + df = DataFrame([ + [1, 2, 5], + [3, 4, 6], + [3, 4, 7] + ], columns=['a', 'a', 'b']) + + result0 = df.drop_duplicates() + tm.assert_frame_equal(result0, df) + + result1 = df.drop_duplicates('a') + expected1 = df[:2] + tm.assert_frame_equal(result1, expected1) + + +def test_drop_duplicates_for_take_all(): + df = DataFrame({'AAA': ['foo', 'bar', 'baz', 'bar', + 'foo', 'bar', 'qux', 'foo'], + 'B': ['one', 'one', 'two', 'two', + 'two', 'two', 'one', 'two'], + 'C': [1, 1, 2, 2, 2, 2, 1, 2], + 'D': lrange(8)}) + + # single column + result = df.drop_duplicates('AAA') + expected = df.iloc[[0, 1, 2, 6]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates('AAA', keep='last') + expected = df.iloc[[2, 5, 6, 7]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates('AAA', keep=False) + expected = df.iloc[[2, 6]] + tm.assert_frame_equal(result, expected) + + # multiple columns + result = df.drop_duplicates(['AAA', 'B']) + expected = df.iloc[[0, 1, 2, 3, 4, 6]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates(['AAA', 'B'], keep='last') + expected = df.iloc[[0, 1, 2, 5, 6, 7]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates(['AAA', 'B'], keep=False) + expected = df.iloc[[0, 1, 2, 6]] + tm.assert_frame_equal(result, expected) + + +def test_drop_duplicates_tuple(): + df = DataFrame({('AA', 'AB'): ['foo', 'bar', 'foo', 'bar', + 'foo', 'bar', 'bar', 'foo'], + 'B': ['one', 'one', 'two', 'two', + 'two', 'two', 'one', 'two'], + 'C': [1, 1, 2, 2, 2, 2, 1, 2], + 'D': lrange(8)}) + + # single column + result = df.drop_duplicates(('AA', 'AB')) + expected = df[:2] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates(('AA', 'AB'), keep='last') + expected = df.loc[[6, 7]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates(('AA', 'AB'), keep=False) + expected = df.loc[[]] # empty df + assert len(result) == 0 + tm.assert_frame_equal(result, expected) + + # multi column + expected = df.loc[[0, 1, 2, 3]] + result = df.drop_duplicates((('AA', 'AB'), 'B')) + tm.assert_frame_equal(result, expected) + + +def test_drop_duplicates_NA(): + # none + df = DataFrame({'A': [None, None, 'foo', 'bar', + 'foo', 'bar', 'bar', 'foo'], + 'B': ['one', 'one', 'two', 'two', + 'two', 'two', 'one', 'two'], + 'C': [1.0, np.nan, np.nan, np.nan, 1., 1., 1, 1.], + 'D': lrange(8)}) + + # single column + result = df.drop_duplicates('A') + expected = df.loc[[0, 2, 3]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates('A', keep='last') + expected = df.loc[[1, 6, 7]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates('A', keep=False) + expected = df.loc[[]] # empty df + tm.assert_frame_equal(result, expected) + assert len(result) == 0 + + # multi column + result = df.drop_duplicates(['A', 'B']) + expected = df.loc[[0, 2, 3, 6]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates(['A', 'B'], keep='last') + expected = df.loc[[1, 5, 6, 7]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates(['A', 'B'], keep=False) + expected = df.loc[[6]] + tm.assert_frame_equal(result, expected) + + # nan + df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', + 'foo', 'bar', 'bar', 'foo'], + 'B': ['one', 'one', 'two', 'two', + 'two', 'two', 'one', 'two'], + 'C': [1.0, np.nan, np.nan, np.nan, 1., 1., 1, 1.], + 'D': lrange(8)}) + + # single column + result = df.drop_duplicates('C') + expected = df[:2] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates('C', keep='last') + expected = df.loc[[3, 7]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates('C', keep=False) + expected = df.loc[[]] # empty df + tm.assert_frame_equal(result, expected) + assert len(result) == 0 + + # multi column + result = df.drop_duplicates(['C', 'B']) + expected = df.loc[[0, 1, 2, 4]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates(['C', 'B'], keep='last') + expected = df.loc[[1, 3, 6, 7]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates(['C', 'B'], keep=False) + expected = df.loc[[1]] + tm.assert_frame_equal(result, expected) + + +def test_drop_duplicates_NA_for_take_all(): + # none + df = DataFrame({'A': [None, None, 'foo', 'bar', + 'foo', 'baz', 'bar', 'qux'], + 'C': [1.0, np.nan, np.nan, np.nan, 1., 2., 3, 1.]}) + + # single column + result = df.drop_duplicates('A') + expected = df.iloc[[0, 2, 3, 5, 7]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates('A', keep='last') + expected = df.iloc[[1, 4, 5, 6, 7]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates('A', keep=False) + expected = df.iloc[[5, 7]] + tm.assert_frame_equal(result, expected) + + # nan + + # single column + result = df.drop_duplicates('C') + expected = df.iloc[[0, 1, 5, 6]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates('C', keep='last') + expected = df.iloc[[3, 5, 6, 7]] + tm.assert_frame_equal(result, expected) + + result = df.drop_duplicates('C', keep=False) + expected = df.iloc[[5, 6]] + tm.assert_frame_equal(result, expected) + + +def test_drop_duplicates_inplace(): + orig = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', + 'foo', 'bar', 'bar', 'foo'], + 'B': ['one', 'one', 'two', 'two', + 'two', 'two', 'one', 'two'], + 'C': [1, 1, 2, 2, 2, 2, 1, 2], + 'D': lrange(8)}) + + # single column + df = orig.copy() + df.drop_duplicates('A', inplace=True) + expected = orig[:2] + result = df + tm.assert_frame_equal(result, expected) + + df = orig.copy() + df.drop_duplicates('A', keep='last', inplace=True) + expected = orig.loc[[6, 7]] + result = df + tm.assert_frame_equal(result, expected) + + df = orig.copy() + df.drop_duplicates('A', keep=False, inplace=True) + expected = orig.loc[[]] + result = df + tm.assert_frame_equal(result, expected) + assert len(df) == 0 + + # multi column + df = orig.copy() + df.drop_duplicates(['A', 'B'], inplace=True) + expected = orig.loc[[0, 1, 2, 3]] + result = df + tm.assert_frame_equal(result, expected) + + df = orig.copy() + df.drop_duplicates(['A', 'B'], keep='last', inplace=True) + expected = orig.loc[[0, 5, 6, 7]] + result = df + tm.assert_frame_equal(result, expected) + + df = orig.copy() + df.drop_duplicates(['A', 'B'], keep=False, inplace=True) + expected = orig.loc[[0]] + result = df + tm.assert_frame_equal(result, expected) + + # consider everything + orig2 = orig.loc[:, ['A', 'B', 'C']].copy() + + df2 = orig2.copy() + df2.drop_duplicates(inplace=True) + # in this case only + expected = orig2.drop_duplicates(['A', 'B']) + result = df2 + tm.assert_frame_equal(result, expected) + + df2 = orig2.copy() + df2.drop_duplicates(keep='last', inplace=True) + expected = orig2.drop_duplicates(['A', 'B'], keep='last') + result = df2 + tm.assert_frame_equal(result, expected) + + df2 = orig2.copy() + df2.drop_duplicates(keep=False, inplace=True) + expected = orig2.drop_duplicates(['A', 'B'], keep=False) + result = df2 + tm.assert_frame_equal(result, expected)